shahbazdev0 commited on
Commit
f7db2f9
·
verified ·
1 Parent(s): de86471

Upload 9 files

Browse files
Files changed (8) hide show
  1. .gitignore +12 -0
  2. README.md +2 -2
  3. app.py +915 -0
  4. create_sample_dataset.py +1228 -0
  5. evaluation.py +413 -0
  6. graph_manager.py +156 -0
  7. utils.py +131 -0
  8. version_rag.py +477 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local environment, caches, and runtime artifacts
2
+ .env
3
+ *.pyc
4
+ __pycache__/
5
+ chroma_db_*/
6
+ chroma_baseline_*/
7
+ user_data_*/
8
+ sample_data/
9
+ venv/
10
+ .venv/
11
+ *.log
12
+ .DS_Store
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 📚
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: streamlit
7
- sdk_version: 1.51.0
8
- app_file: src/streamlit_app.py
9
  pinned: false
10
  ---
11
 
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.28.0
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
app.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # app.py - Main Streamlit Application
3
+ import streamlit as st
4
+ import os
5
+ import json
6
+ import hashlib
7
+ import time
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import List, Dict, Optional, Tuple
14
+ import uuid
15
+
16
+ # Import custom modules
17
+ from version_rag import VersionRAG, BaselineRAG
18
+ from graph_manager import GraphManager
19
+ from evaluation import Evaluator, VersionQADataset
20
+ from utils import DocumentProcessor, ChangeDetector, PersistentStorage
21
+
22
# Page configuration — must be the first Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "VersionRAG - Version-Aware RAG System",
    "page_icon": "📚",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
29
+
30
# Initialize session state
def init_session_state():
    """Seed st.session_state with every key the app reads.

    Safe to call on each Streamlit rerun: only missing keys are set,
    so existing session data survives reruns.
    """
    if 'user_id' not in st.session_state:
        # Fresh anonymous identity per browser session; used to namespace
        # the per-user VersionRAG / BaselineRAG / GraphManager stores.
        st.session_state.user_id = str(uuid.uuid4())

    # NOTE: the dict literal is rebuilt on every call, so the mutable
    # defaults ({} / []) are never shared between sessions.
    defaults = {
        'version_rag': None,
        'baseline_rag': None,
        'graph_manager': None,
        'uploaded_files': {},
        'chat_history': [],
        'evaluation_results': None,
        'feedback_data': [],
        'persistent_storage': None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

init_session_state()
52
+
53
# Custom CSS. Classes used elsewhere in this file: .main-header (title banner),
# .diff-added / .diff-removed (Tab 4 change rendering), .version-tag.
# NOTE(review): .metric-card does not appear to be referenced in this file — confirm before removing.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        padding: 1rem 0;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .diff-added {
        background-color: #d4edda;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
    }
    .diff-removed {
        background-color: #f8d7da;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
    }
    .version-tag {
        background-color: #e7f3ff;
        color: #0366d6;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
        font-weight: bold;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 2rem;
    }
</style>
""", unsafe_allow_html=True)
91
+
92
# Sidebar: session info, model settings, retrieval knobs, system init,
# and a summary of the current knowledge base.
with st.sidebar:
    st.markdown("### 🔐 User Session")
    st.info(f"User ID: {st.session_state.user_id[:8]}...")

    st.markdown("### ⚙️ Settings")

    # API Key input — exported to the environment so downstream clients pick it up.
    api_key = st.text_input("OpenAI API Key", type="password",
                            value=os.getenv("OPENAI_API_KEY", ""))
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    # Model selection
    model_name = st.selectbox(
        "LLM Model",
        ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"],
        index=0
    )

    # Embedding model
    embedding_model = st.selectbox(
        "Embedding Model",
        ["text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002"],
        index=0
    )

    # Retrieval parameters
    st.markdown("### 🎯 Retrieval Parameters")
    top_k = st.slider("Top K Results", 1, 10, 5)
    # NOTE(review): similarity_threshold is collected here but never passed to
    # any query in this file — confirm it is intentional or wire it through.
    similarity_threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.7)

    # Initialize systems button — builds both RAG variants plus the version graph.
    if st.button("🚀 Initialize Systems", type="primary"):
        with st.spinner("Initializing VersionRAG and Baseline systems..."):
            try:
                st.session_state.version_rag = VersionRAG(
                    user_id=st.session_state.user_id,
                    model_name=model_name,
                    embedding_model=embedding_model
                )
                st.session_state.baseline_rag = BaselineRAG(
                    user_id=st.session_state.user_id,
                    model_name=model_name,
                    embedding_model=embedding_model
                )
                st.session_state.graph_manager = GraphManager(
                    user_id=st.session_state.user_id
                )
                st.success("✅ Systems initialized successfully!")
            except Exception as e:
                st.error(f"❌ Initialization error: {str(e)}")

    # Knowledge base status
    if st.session_state.uploaded_files:
        st.markdown("### 📚 Knowledge Base")
        for filename, info in st.session_state.uploaded_files.items():
            # FIX: label previously was a placeholder-less f-string and never
            # showed the file name; use the loop variable.
            with st.expander(f"📄 {filename}"):
                st.write(f"**Version:** {info['version']}")
                st.write(f"**Uploaded:** {info['timestamp']}")
                st.write(f"**Hash:** {info['hash'][:12]}...")
153
+
154
# Main content — page banner styled by the .main-header CSS class above.
st.markdown('<div class="main-header">📚 VersionRAG: Version-Aware RAG System</div>',
            unsafe_allow_html=True)

# Create tabs — order here defines the order of the tab variables.
_TAB_LABELS = [
    "📤 Document Upload",
    "💬 Query Interface",
    "📊 Evaluation",
    "🔍 Version Explorer",
    "📈 Analytics",
    "👥 Multi-User Management",
]
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(_TAB_LABELS)
167
+
168
# Tab 1: Document Upload
# Per-file flow: read -> extract text -> hash -> (skip if unchanged /
# diff if changed) -> index into VersionRAG + BaselineRAG -> record in
# the version graph -> remember metadata in session state.
with tab1:
    st.header("Document Upload & Indexing")

    col1, col2 = st.columns([2, 1])

    with col1:
        uploaded_files = st.file_uploader(
            "Upload versioned documents (PDF, TXT)",
            type=["pdf", "txt"],
            accept_multiple_files=True
        )

        if uploaded_files:
            st.markdown("### 📋 File Metadata")
            for idx, file in enumerate(uploaded_files):
                with st.expander(f"📄 {file.name}", expanded=True):
                    col_a, col_b = st.columns(2)
                    with col_a:
                        version = st.text_input(
                            "Version",
                            key=f"version_{idx}",
                            value="1.0.0"
                        )
                    with col_b:
                        domain = st.selectbox(
                            "Domain",
                            ["Software", "Healthcare", "Finance", "Industrial", "Other"],
                            key=f"domain_{idx}"
                        )

                    # Default topic: file name without its extension.
                    topic = st.text_input(
                        "Topic/Module",
                        key=f"topic_{idx}",
                        value=file.name.split('.')[0]
                    )

                    if st.button(f"Process {file.name}", key=f"process_{idx}"):
                        if not st.session_state.version_rag:
                            st.error("Please initialize systems first!")
                        else:
                            with st.spinner(f"Processing {file.name}..."):
                                try:
                                    # Read file content
                                    content = file.read()
                                    if file.type == "application/pdf":
                                        text = DocumentProcessor.extract_text_from_pdf(content)
                                    else:
                                        # Non-PDF uploads are treated as UTF-8 text.
                                        text = content.decode('utf-8')

                                    # Calculate hash of raw bytes — the dedup key.
                                    file_hash = hashlib.sha256(content).hexdigest()

                                    # Check if file already exists in this session.
                                    if file.name in st.session_state.uploaded_files:
                                        old_hash = st.session_state.uploaded_files[file.name]['hash']
                                        if old_hash == file_hash:
                                            st.info("File unchanged, skipping indexing.")
                                            # Skip straight to the next uploaded file.
                                            continue
                                        else:
                                            st.info("File changed, re-indexing with diff analysis...")
                                            # Perform diff analysis against the stored text.
                                            old_text = st.session_state.uploaded_files[file.name]['text']
                                            changes = ChangeDetector.compute_diff(old_text, text)

                                            # Record the change set on the version graph.
                                            st.session_state.graph_manager.add_version_with_changes(
                                                document_name=topic,
                                                version=version,
                                                changes=changes
                                            )

                                    # Add to VersionRAG (rich metadata per chunk).
                                    st.session_state.version_rag.add_documents(
                                        texts=[text],
                                        metadatas=[{
                                            'filename': file.name,
                                            'version': version,
                                            'domain': domain,
                                            'topic': topic,
                                            'hash': file_hash,
                                            'timestamp': datetime.now().isoformat()
                                        }]
                                    )

                                    # Add to Baseline RAG (minimal metadata, for comparison).
                                    st.session_state.baseline_rag.add_documents(
                                        texts=[text],
                                        metadatas=[{
                                            'filename': file.name,
                                            'version': version
                                        }]
                                    )

                                    # Register this version node in the graph.
                                    st.session_state.graph_manager.add_document_version(
                                        document_name=topic,
                                        version=version,
                                        content=text,
                                        metadata={
                                            'domain': domain,
                                            'filename': file.name
                                        }
                                    )

                                    # Store in session state; 'text' is kept so a later
                                    # re-upload can be diffed against it.
                                    st.session_state.uploaded_files[file.name] = {
                                        'version': version,
                                        'domain': domain,
                                        'topic': topic,
                                        'hash': file_hash,
                                        'text': text,
                                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                                    }

                                    st.success(f"✅ Successfully processed {file.name}")

                                except Exception as e:
                                    st.error(f"❌ Error processing {file.name}: {str(e)}")

    with col2:
        st.markdown("### 📊 Upload Statistics")
        if st.session_state.uploaded_files:
            stats_data = {
                'Total Files': len(st.session_state.uploaded_files),
                'Domains': len(set(f['domain'] for f in st.session_state.uploaded_files.values())),
                'Total Versions': len(set(f['version'] for f in st.session_state.uploaded_files.values()))
            }

            for key, value in stats_data.items():
                st.metric(key, value)

            # Domain distribution pie chart.
            domain_counts = {}
            for file_info in st.session_state.uploaded_files.values():
                domain = file_info['domain']
                domain_counts[domain] = domain_counts.get(domain, 0) + 1

            fig = px.pie(
                values=list(domain_counts.values()),
                names=list(domain_counts.keys()),
                title="Documents by Domain"
            )
            st.plotly_chart(fig, use_container_width=True)
312
+
313
# Tab 2: Query Interface
# Routes the query to one of three VersionRAG entry points by query type,
# optionally runs the baseline side-by-side, and records timing/history.
with tab2:
    st.header("Interactive Query Interface")

    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first from the sidebar!")
    else:
        # Query type selection
        query_type = st.radio(
            "Query Type",
            ["Content Retrieval", "Version Inquiry", "Change Retrieval"],
            horizontal=True
        )

        # Query input
        col1, col2 = st.columns([3, 1])
        with col1:
            query = st.text_input(
                "Enter your query",
                placeholder="e.g., What is the assert module in Node.js v20.0?"
            )

        with col2:
            compare_mode = st.checkbox("Compare with Baseline", value=True)

        # Version filter (only meaningful for content retrieval)
        if query_type == "Content Retrieval":
            version_filter = st.text_input(
                "Version Filter (optional)",
                placeholder="e.g., 1.2.0"
            )
        else:
            version_filter = None

        if st.button("🔍 Search", type="primary"):
            if not query:
                st.warning("Please enter a query!")
            else:
                with st.spinner("Searching..."):
                    start_time = time.time()

                    # VersionRAG query — dispatch on the selected query type.
                    if query_type == "Content Retrieval":
                        vrag_result = st.session_state.version_rag.query(
                            query=query,
                            version_filter=version_filter,
                            top_k=top_k
                        )
                    elif query_type == "Version Inquiry":
                        vrag_result = st.session_state.version_rag.version_inquiry(
                            query=query
                        )
                    else:  # Change Retrieval
                        vrag_result = st.session_state.version_rag.change_retrieval(
                            query=query
                        )

                    vrag_time = time.time() - start_time

                    # Baseline query (only when comparison is enabled;
                    # baseline_result / baseline_time exist only in that case).
                    if compare_mode:
                        start_time = time.time()
                        baseline_result = st.session_state.baseline_rag.query(
                            query=query,
                            top_k=top_k
                        )
                        baseline_time = time.time() - start_time

                # Display results — two columns when comparing, one otherwise.
                if compare_mode:
                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("### 🚀 VersionRAG Response")
                        st.markdown(f"**Response Time:** {vrag_time:.3f}s")
                        st.markdown("---")
                        st.markdown(vrag_result['answer'])

                        if 'sources' in vrag_result:
                            with st.expander("📚 Sources"):
                                for idx, source in enumerate(vrag_result['sources']):
                                    st.markdown(f"**Source {idx+1}**")
                                    st.markdown(f"- Version: `{source.get('version', 'N/A')}`")
                                    st.markdown(f"- File: `{source.get('filename', 'N/A')}`")
                                    st.markdown(f"- Similarity: {source.get('similarity', 0):.3f}")
                                    st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")

                    with col2:
                        st.markdown("### 📊 Baseline RAG Response")
                        st.markdown(f"**Response Time:** {baseline_time:.3f}s")
                        st.markdown("---")
                        st.markdown(baseline_result['answer'])

                        if 'sources' in baseline_result:
                            with st.expander("📚 Sources"):
                                for idx, source in enumerate(baseline_result['sources']):
                                    st.markdown(f"**Source {idx+1}**")
                                    st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")
                else:
                    st.markdown("### 🚀 VersionRAG Response")
                    st.markdown(f"**Response Time:** {vrag_time:.3f}s")
                    st.markdown("---")
                    st.markdown(vrag_result['answer'])

                    if 'sources' in vrag_result:
                        with st.expander("📚 Sources"):
                            for idx, source in enumerate(vrag_result['sources']):
                                st.markdown(f"**Source {idx+1}**")
                                st.markdown(f"- Version: `{source.get('version', 'N/A')}`")
                                st.markdown(f"- File: `{source.get('filename', 'N/A')}`")
                                st.markdown(f"- Similarity: {source.get('similarity', 0):.3f}")
                                st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")

                # Feedback
                # NOTE(review): this "Submit Feedback" button is nested inside the
                # "Search" button's branch; on a Streamlit rerun triggered by the
                # nested click the outer button reads False, so the feedback append
                # likely never executes — verify and consider a form/callback.
                st.markdown("### 📝 Feedback")
                col1, col2, col3 = st.columns([1, 1, 2])
                with col1:
                    rating = st.slider("Rate this answer", 1, 5, 3)
                with col2:
                    if st.button("Submit Feedback"):
                        st.session_state.feedback_data.append({
                            'query': query,
                            'query_type': query_type,
                            'rating': rating,
                            'timestamp': datetime.now().isoformat(),
                            'response_time': vrag_time
                        })
                        st.success("Thank you for your feedback!")

                # Add to chat history (baseline fields are None when not comparing).
                st.session_state.chat_history.append({
                    'query': query,
                    'query_type': query_type,
                    'vrag_answer': vrag_result['answer'],
                    'vrag_time': vrag_time,
                    'baseline_answer': baseline_result['answer'] if compare_mode else None,
                    'baseline_time': baseline_time if compare_mode else None,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

        # Chat history — most recent five, newest first.
        if st.session_state.chat_history:
            st.markdown("### 💭 Query History")
            for idx, chat in enumerate(reversed(st.session_state.chat_history[-5:])):
                with st.expander(f"{chat['timestamp']} - {chat['query'][:50]}..."):
                    st.markdown(f"**Query Type:** {chat['query_type']}")
                    st.markdown(f"**VersionRAG Answer:** {chat['vrag_answer'][:200]}...")
                    st.markdown(f"**Response Time:** {chat['vrag_time']:.3f}s")
461
+
462
# Tab 3: Evaluation
# Runs VersionRAG and the baseline over a QA dataset (custom JSON or the
# built-in Mini-VersionQA) and renders comparative metrics and charts.
with tab3:
    st.header("System Evaluation")

    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        st.markdown("""
        This section evaluates VersionRAG against the baseline system using the Mini-VersionQA dataset.
        Metrics include Hit@k, MRR, Accuracy, and Version-Sensitive Accuracy (VSA).
        """)

        # Evaluation dataset configuration
        st.markdown("### 📋 Evaluation Dataset Configuration")

        use_custom_dataset = st.checkbox("Use custom evaluation dataset")

        # FIX: qa_data was only assigned inside the branches below, so checking
        # "custom dataset" without uploading a file left it unbound and the
        # evaluation button raised NameError. Default it to None up front.
        qa_data = None
        if use_custom_dataset:
            uploaded_qa_file = st.file_uploader(
                "Upload QA Dataset (JSON)",
                type=["json"]
            )
            if uploaded_qa_file:
                qa_data = json.load(uploaded_qa_file)
                st.success(f"Loaded {len(qa_data)} questions")
        else:
            st.info("Using default Mini-VersionQA dataset")

        if st.button("🚀 Run Evaluation", type="primary"):
            with st.spinner("Running evaluation..."):
                try:
                    # Initialize evaluator over both systems.
                    evaluator = Evaluator(
                        version_rag=st.session_state.version_rag,
                        baseline_rag=st.session_state.baseline_rag
                    )

                    # Create or load dataset — fall back to the bundled set.
                    if qa_data:
                        dataset = VersionQADataset.from_dict(qa_data)
                    else:
                        dataset = VersionQADataset.create_mini_versionqa()

                    # Run evaluation and persist results for other tabs.
                    results = evaluator.evaluate(dataset)
                    st.session_state.evaluation_results = results

                    # Display results
                    st.markdown("### 📊 Evaluation Results")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("#### 🚀 VersionRAG")
                        st.metric("Accuracy", f"{results['versionrag']['accuracy']:.2%}")
                        st.metric("Hit@5", f"{results['versionrag']['hit_at_5']:.2%}")
                        st.metric("MRR", f"{results['versionrag']['mrr']:.3f}")
                        st.metric("VSA", f"{results['versionrag']['vsa']:.2%}")
                        st.metric("Avg Latency", f"{results['versionrag']['avg_latency']:.3f}s")

                    with col2:
                        st.markdown("#### 📊 Baseline RAG")
                        st.metric("Accuracy", f"{results['baseline']['accuracy']:.2%}")
                        st.metric("Hit@5", f"{results['baseline']['hit_at_5']:.2%}")
                        st.metric("MRR", f"{results['baseline']['mrr']:.3f}")
                        st.metric("VSA", f"{results['baseline']['vsa']:.2%}")
                        st.metric("Avg Latency", f"{results['baseline']['avg_latency']:.3f}s")

                    # Performance improvement, in percentage points.
                    st.markdown("### 📈 Performance Improvement")
                    improvement = {
                        'Accuracy': (results['versionrag']['accuracy'] - results['baseline']['accuracy']) * 100,
                        'Hit@5': (results['versionrag']['hit_at_5'] - results['baseline']['hit_at_5']) * 100,
                        'MRR': (results['versionrag']['mrr'] - results['baseline']['mrr']) * 100,
                        'VSA': (results['versionrag']['vsa'] - results['baseline']['vsa']) * 100
                    }

                    fig = go.Figure(data=[
                        go.Bar(name='Improvement', x=list(improvement.keys()),
                               y=list(improvement.values()),
                               marker_color='lightblue')
                    ])
                    fig.add_hline(y=25, line_dash="dash", line_color="red",
                                  annotation_text="Target: 25 points")
                    fig.update_layout(
                        title="VersionRAG vs Baseline - Performance Improvement (percentage points)",
                        yaxis_title="Improvement (%)",
                        showlegend=False
                    )
                    st.plotly_chart(fig, use_container_width=True)

                    # Query type breakdown
                    st.markdown("### 🔍 Performance by Query Type")

                    query_types = ['Content Retrieval', 'Version Inquiry', 'Change Retrieval']
                    vrag_scores = [
                        results['versionrag']['by_type']['content_retrieval'],
                        results['versionrag']['by_type']['version_inquiry'],
                        results['versionrag']['by_type']['change_retrieval']
                    ]
                    baseline_scores = [
                        results['baseline']['by_type']['content_retrieval'],
                        results['baseline']['by_type']['version_inquiry'],
                        results['baseline']['by_type']['change_retrieval']
                    ]

                    fig = go.Figure(data=[
                        go.Bar(name='VersionRAG', x=query_types, y=vrag_scores),
                        go.Bar(name='Baseline', x=query_types, y=baseline_scores)
                    ])
                    fig.update_layout(
                        title="Accuracy by Query Type",
                        yaxis_title="Accuracy (%)",
                        barmode='group'
                    )
                    st.plotly_chart(fig, use_container_width=True)

                    # Success criteria check (thresholds in percent / points).
                    st.markdown("### ✅ Success Criteria")
                    criteria = {
                        'VSA Improvement ≥ 25 points': improvement['VSA'] >= 25,
                        'Content Retrieval ≥ 85%': vrag_scores[0] >= 85,
                        'Version Inquiry ≥ 90%': vrag_scores[1] >= 90,
                        'Change Retrieval ≥ 60%': vrag_scores[2] >= 60
                    }

                    for criterion, passed in criteria.items():
                        if passed:
                            st.success(f"✅ {criterion}")
                        else:
                            st.error(f"❌ {criterion}")

                except Exception as e:
                    st.error(f"Evaluation error: {str(e)}")
598
+
599
# Tab 4: Version Explorer
# Browses the version graph: per-document version timeline plus a
# two-version diff view rendered with the .diff-added/.diff-removed CSS.
with tab4:
    st.header("Version Explorer")

    if not st.session_state.graph_manager:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        # Document selection
        documents = st.session_state.graph_manager.get_all_documents()

        if not documents:
            st.info("No documents uploaded yet. Please upload documents in the 'Document Upload' tab.")
        else:
            selected_doc = st.selectbox("Select Document", documents)

            if selected_doc:
                # Get versions for selected document
                versions = st.session_state.graph_manager.get_document_versions(selected_doc)

                st.markdown(f"### 📚 {selected_doc}")
                st.markdown(f"**Total Versions:** {len(versions)}")

                # Timeline and comparison only make sense with >1 version.
                if len(versions) > 1:
                    st.markdown("### 📅 Version Timeline")
                    timeline_data = []
                    for v in sorted(versions):
                        version_info = st.session_state.graph_manager.get_version_info(
                            selected_doc, v
                        )
                        timeline_data.append({
                            'Version': v,
                            'Date': version_info.get('timestamp', 'N/A')
                        })

                    df = pd.DataFrame(timeline_data)
                    st.dataframe(df, use_container_width=True)

                    # Version comparison
                    st.markdown("### 🔄 Version Comparison")
                    col1, col2 = st.columns(2)

                    with col1:
                        version1 = st.selectbox("Version 1", sorted(versions), index=0)
                    with col2:
                        # Default the second selector to the next version when available.
                        version2 = st.selectbox("Version 2", sorted(versions),
                                                index=min(1, len(versions)-1))

                    if version1 and version2 and version1 != version2:
                        if st.button("Compare Versions"):
                            with st.spinner("Computing differences..."):
                                changes = st.session_state.graph_manager.get_changes_between_versions(
                                    selected_doc, version1, version2
                                )

                                st.markdown("### 📝 Changes Detected")

                                if changes['additions']:
                                    st.markdown("#### ➕ Additions")
                                    for add in changes['additions']:
                                        st.markdown(f'<div class="diff-added">{add}</div>',
                                                    unsafe_allow_html=True)

                                if changes['deletions']:
                                    st.markdown("#### ➖ Deletions")
                                    for delete in changes['deletions']:
                                        st.markdown(f'<div class="diff-removed">{delete}</div>',
                                                    unsafe_allow_html=True)

                                if changes['modifications']:
                                    st.markdown("#### 🔄 Modifications")
                                    for mod in changes['modifications']:
                                        st.markdown(f"- {mod}")

                                # Visualize change counts as a bar chart.
                                st.markdown("### 📊 Change Statistics")
                                change_stats = {
                                    'Additions': len(changes['additions']),
                                    'Deletions': len(changes['deletions']),
                                    'Modifications': len(changes['modifications'])
                                }

                                fig = px.bar(
                                    x=list(change_stats.keys()),
                                    y=list(change_stats.values()),
                                    title=f"Changes from {version1} to {version2}",
                                    labels={'x': 'Change Type', 'y': 'Count'}
                                )
                                st.plotly_chart(fig, use_container_width=True)
688
+
689
# Tab 5: Analytics
# Read-only dashboards over session-state data: query counts, ratings,
# response-time trend, and CSV export of history/feedback.
with tab5:
    st.header("System Analytics")

    # System statistics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Total Queries", len(st.session_state.chat_history))
    with col2:
        if st.session_state.feedback_data:
            avg_rating = sum(f['rating'] for f in st.session_state.feedback_data) / len(st.session_state.feedback_data)
            st.metric("Avg Rating", f"{avg_rating:.2f} / 5")
        else:
            st.metric("Avg Rating", "N/A")
    with col3:
        if st.session_state.chat_history:
            avg_response_time = sum(c['vrag_time'] for c in st.session_state.chat_history) / len(st.session_state.chat_history)
            st.metric("Avg Response Time", f"{avg_response_time:.3f}s")
        else:
            st.metric("Avg Response Time", "N/A")
    with col4:
        st.metric("Total Documents", len(st.session_state.uploaded_files))

    # Query type distribution
    if st.session_state.chat_history:
        st.markdown("### 📊 Query Type Distribution")
        query_type_counts = {}
        for chat in st.session_state.chat_history:
            qtype = chat['query_type']
            query_type_counts[qtype] = query_type_counts.get(qtype, 0) + 1

        fig = px.pie(
            values=list(query_type_counts.values()),
            names=list(query_type_counts.keys()),
            title="Distribution of Query Types"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Response time trend (needs at least two points for a line).
    if len(st.session_state.chat_history) > 1:
        st.markdown("### ⏱️ Response Time Trend")
        times = [c['vrag_time'] for c in st.session_state.chat_history]
        fig = go.Figure(data=go.Scatter(
            y=times,
            mode='lines+markers',
            name='Response Time'
        ))
        fig.update_layout(
            title="Response Time Over Queries",
            xaxis_title="Query Number",
            yaxis_title="Response Time (s)"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Feedback analysis
    if st.session_state.feedback_data:
        st.markdown("### 📝 User Feedback Analysis")

        # Rating distribution
        rating_counts = {}
        for feedback in st.session_state.feedback_data:
            rating = feedback['rating']
            rating_counts[rating] = rating_counts.get(rating, 0) + 1

        fig = go.Figure(data=[
            go.Bar(x=list(rating_counts.keys()), y=list(rating_counts.values()))
        ])
        fig.update_layout(
            title="Rating Distribution",
            xaxis_title="Rating",
            yaxis_title="Count"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Export analytics
    # NOTE(review): st.download_button nested inside st.button only survives a
    # single rerun — the download link disappears on the next interaction;
    # confirm this is acceptable or render the download_button unconditionally.
    st.markdown("### 💾 Export Data")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Export Chat History"):
            if st.session_state.chat_history:
                df = pd.DataFrame(st.session_state.chat_history)
                csv = df.to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "chat_history.csv",
                    "text/csv"
                )

    with col2:
        if st.button("Export Feedback Data"):
            if st.session_state.feedback_data:
                df = pd.DataFrame(st.session_state.feedback_data)
                csv = df.to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "feedback_data.csv",
                    "text/csv"
                )
791
+
792
# Tab 6: Multi-User Management
# Shows the current session's identity/stats, explains data isolation,
# and offers session reset + JSON export of session data.
with tab6:
    st.header("Multi-User Management")

    st.markdown("""
    This section demonstrates VersionRAG's multi-user capabilities with logical data separation
    and persistent knowledge base management.
    """)

    # User session info
    st.markdown("### 👤 Current Session")
    col1, col2, col3 = st.columns(3)

    with col1:
        st.info(f"**User ID:** {st.session_state.user_id[:16]}...")
    with col2:
        st.info(f"**Documents:** {len(st.session_state.uploaded_files)}")
    with col3:
        st.info(f"**Queries:** {len(st.session_state.chat_history)}")

    # Data isolation demonstration
    st.markdown("### 🔒 Data Isolation")
    st.markdown("""
    Each user's knowledge base is logically separated using `tenant_id` metadata in ChromaDB.
    This ensures:
    - No data leakage between users
    - Independent query results
    - Isolated document management
    """)

    # Knowledge base status
    st.markdown("### 📚 Knowledge Base Status")

    if st.session_state.uploaded_files:
        kb_data = []
        for filename, info in st.session_state.uploaded_files.items():
            kb_data.append({
                'File': filename,
                'Version': info['version'],
                'Domain': info['domain'],
                'Topic': info['topic'],
                'Uploaded': info['timestamp'],
                'Hash': info['hash'][:12] + "..."
            })

        df = pd.DataFrame(kb_data)
        st.dataframe(df, use_container_width=True)

        # Persistent storage info
        st.success("""
        ✅ **Persistent Storage Active**
        - All documents are stored with file hash tracking
        - Unchanged files skip re-indexing
        - Automatic diff-based updates for modified files
        """)
    else:
        st.info("No documents in knowledge base. Upload documents to get started.")

    # Session management
    st.markdown("### 🔄 Session Management")

    col1, col2 = st.columns(2)

    with col1:
        # NOTE(review): a checkbox rendered inside a button branch only exists on
        # the rerun where the button was True, so this confirm-then-reset flow
        # likely never fires — verify and consider a form or two-step state flag.
        if st.button("🆕 Create New Session"):
            if st.checkbox("Confirm session reset"):
                st.session_state.user_id = str(uuid.uuid4())
                st.session_state.version_rag = None
                st.session_state.baseline_rag = None
                st.session_state.graph_manager = None
                st.session_state.uploaded_files = {}
                st.session_state.chat_history = []
                st.success("New session created!")
                st.rerun()

    with col2:
        if st.button("💾 Export Session Data"):
            # Exported dict includes uploaded_files, which carries full
            # document text — the JSON can be large for big uploads.
            session_data = {
                'user_id': st.session_state.user_id,
                'uploaded_files': st.session_state.uploaded_files,
                'chat_history': st.session_state.chat_history,
                'feedback_data': st.session_state.feedback_data,
                'timestamp': datetime.now().isoformat()
            }

            json_str = json.dumps(session_data, indent=2)
            st.download_button(
                "Download Session JSON",
                json_str,
                f"session_{st.session_state.user_id[:8]}.json",
                "application/json"
            )

    # UX Metrics
    st.markdown("### 📊 UX Metrics")

    col1, col2, col3 = st.columns(3)

    with col1:
        # Calculate reupload count (files with same name but different hash)
        # NOTE(review): hard-coded to 0 — the described calculation is not implemented.
        reupload_count = 0
        st.metric("Reupload Count", reupload_count,
                  help="Number of times files were reuploaded")

    with col2:
        if st.session_state.chat_history:
            avg_response = sum(c['vrag_time'] for c in st.session_state.chat_history) / len(st.session_state.chat_history)
            st.metric("Avg Response Time", f"{avg_response:.3f}s")
        else:
            st.metric("Avg Response Time", "N/A")

    with col3:
        cross_contamination = 0  # placeholder; detection would happen in production
        st.metric("Cross-User Contamination", cross_contamination,
                  help="Number of cross-user data leakage incidents")
907
+
908
# Footer — rendered once at the bottom of every tab.
st.markdown("---")
_FOOTER_HTML = """
<div style='text-align: center; color: #666;'>
<p>VersionRAG - Version-Aware Retrieval-Augmented Generation System</p>
<p>Built with Streamlit, LangChain, and ChromaDB</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
create_sample_dataset.py ADDED
@@ -0,0 +1,1228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # create_sample_dataset.py - Generate Sample Documents for Mini-VersionQA
2
+ import os
3
+
4
+ SAMPLE_DOCS = {
5
+ # Node.js Assert - 3 versions
6
+ "nodejs_assert_v20.0.txt": """# Node.js Assert Module v20.0
7
+
8
+ The assert module provides a set of assertion functions for testing invariants in your code.
9
+
10
+ ## Overview
11
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
12
+
13
+ ## Basic Usage
14
+ ```javascript
15
+ const assert = require('assert');
16
+
17
+ // Strict equality assertion
18
+ assert.strictEqual(1, 1);
19
+
20
+ // Deep equality assertion
21
+ assert.deepStrictEqual({a: 1}, {a: 1});
22
+ ```
23
+
24
+ ## Available Functions
25
+ - assert.ok(value): Tests if value is truthy
26
+ - assert.strictEqual(actual, expected): Tests strict equality
27
+ - assert.deepStrictEqual(actual, expected): Tests deep equality
28
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
29
+ - assert.throws(fn): Tests if function throws an error
30
+
31
+ ## Error Messages
32
+ When assertions fail, the assert module provides detailed error messages showing:
33
+ - The actual value received
34
+ - The expected value
35
+ - The assertion type that failed
36
+
37
+ Version: v20.0
38
+ Released: 2023-04
39
+ """,
40
+
41
+ "nodejs_assert_v21.0.txt": """# Node.js Assert Module v21.0
42
+
43
+ The assert module provides a set of assertion functions for testing invariants in your code.
44
+
45
+ ## Overview
46
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
47
+
48
+ ## NEW in v21.0: Strict Mode
49
+ **MAJOR ADDITION**: The assert module now includes a strict mode by default!
50
+
51
+ ```javascript
52
+ const assert = require('assert').strict;
53
+
54
+ // All assertions now use strict equality by default
55
+ assert.equal(1, 1); // Now uses strictEqual internally
56
+ ```
57
+
58
+ ## Basic Usage
59
+ ```javascript
60
+ const assert = require('assert');
61
+
62
+ // Strict equality assertion
63
+ assert.strictEqual(1, 1);
64
+
65
+ // Deep equality assertion
66
+ assert.deepStrictEqual({a: 1}, {a: 1});
67
+ ```
68
+
69
+ ## Available Functions
70
+ - assert.ok(value): Tests if value is truthy
71
+ - assert.strictEqual(actual, expected): Tests strict equality
72
+ - assert.deepStrictEqual(actual, expected): Tests deep equality
73
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
74
+ - assert.throws(fn): Tests if function throws an error
75
+ - **NEW**: assert.rejects(promise): Tests if promise rejects (async support)
76
+ - **NEW**: assert.strict: Strict mode enabled by default
77
+
78
+ ## Strict Mode Benefits
79
+ - Prevents common mistakes with type coercion
80
+ - Enforces strict equality checks
81
+ - Better error messages for mismatched types
82
+
83
+ ## Error Messages
84
+ When assertions fail, the assert module provides detailed error messages showing:
85
+ - The actual value received
86
+ - The expected value
87
+ - The assertion type that failed
88
+ - Stack trace for debugging
89
+
90
+ Version: v21.0
91
+ Released: 2023-10
92
+ """,
93
+
94
+ "nodejs_assert_v23.0.txt": """# Node.js Assert Module v23.0
95
+
96
+ The assert module provides a set of assertion functions for testing invariants in your code.
97
+
98
+ ## Overview
99
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
100
+
101
+ ## Strict Mode (Added in v21.0)
102
+ The assert module includes a strict mode by default:
103
+
104
+ ```javascript
105
+ const assert = require('assert').strict;
106
+
107
+ // All assertions now use strict equality by default
108
+ assert.equal(1, 1); // Uses strictEqual internally
109
+ ```
110
+
111
+ ## NEW in v23.0: Enhanced Diff Output
112
+ **MAJOR IMPROVEMENT**: Better visualization of differences in failed assertions!
113
+
114
+ ```javascript
115
+ // Now shows colored diff output for complex objects
116
+ assert.deepStrictEqual(
117
+ { user: { name: 'John', age: 30 } },
118
+ { user: { name: 'Jane', age: 30 } }
119
+ );
120
+ // Output shows highlighted differences with + and - markers
121
+ ```
122
+
123
+ ## Available Functions
124
+ - assert.ok(value): Tests if value is truthy
125
+ - assert.strictEqual(actual, expected): Tests strict equality
126
+ - assert.deepStrictEqual(actual, expected): Tests deep equality with enhanced diff
127
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
128
+ - assert.throws(fn): Tests if function throws an error
129
+ - assert.rejects(promise): Tests if promise rejects (async support)
130
+ - assert.strict: Strict mode enabled by default
131
+ - **NEW**: assert.match(string, regexp): Tests string against regexp
132
+ - **NEW**: assert.snapshot(value, snapshot): Snapshot testing support
133
+
134
+ ## Strict Mode Benefits
135
+ - Prevents common mistakes with type coercion
136
+ - Enforces strict equality checks
137
+ - Better error messages for mismatched types
138
+
139
+ ## Enhanced Error Messages (v23.0)
140
+ - Color-coded diff output
141
+ - Side-by-side comparison for objects
142
+ - Detailed stack traces with source maps
143
+ - Performance metrics for failed assertions
144
+
145
+ Version: v23.0
146
+ Released: 2024-04
147
+ """,
148
+
149
+ # Bootstrap - 2 versions
150
+ "bootstrap_v5.2.txt": """# Bootstrap v5.2 Documentation
151
+
152
+ ## Grid System
153
+
154
+ Bootstrap includes a powerful mobile-first flexbox grid system for building layouts of all shapes and sizes.
155
+
156
+ ### Grid Classes
157
+ The grid system uses a series of containers, rows, and columns to layout and align content.
158
+
159
+ #### Container Classes
160
+ - `.container`: Fixed-width container
161
+ - `.container-fluid`: Full-width container
162
+ - `.container-{breakpoint}`: Responsive container
163
+
164
+ #### Row Classes
165
+ - `.row`: Creates a grid row
166
+ - `.row-cols-*`: Set number of columns
167
+
168
+ #### Column Classes
169
+ - `.col`: Equal-width columns
170
+ - `.col-{breakpoint}`: Responsive columns
171
+ - `.col-{number}`: Sized columns (1-12)
172
+ - `.col-{breakpoint}-{number}`: Responsive sized columns
173
+
174
+ ### Responsive Breakpoints
175
+ - xs: <576px
176
+ - sm: ≥576px
177
+ - md: ≥768px
178
+ - lg: ≥992px
179
+ - xl: ≥1200px
180
+ - xxl: ≥1400px
181
+
182
+ ### Example Usage
183
+ ```html
184
+ <div class="container">
185
+ <div class="row">
186
+ <div class="col-md-4">Column 1</div>
187
+ <div class="col-md-4">Column 2</div>
188
+ <div class="col-md-4">Column 3</div>
189
+ </div>
190
+ </div>
191
+ ```
192
+
193
+ ### Grid Gutters
194
+ - `.g-*`: Gutter spacing (0-5)
195
+ - `.gx-*`: Horizontal gutters
196
+ - `.gy-*`: Vertical gutters
197
+
198
+ Version: v5.2
199
+ Released: 2022-07
200
+ """,
201
+
202
+ "bootstrap_v5.3.txt": """# Bootstrap v5.3 Documentation
203
+
204
+ ## Grid System
205
+
206
+ Bootstrap includes a powerful mobile-first flexbox grid system for building layouts of all shapes and sizes.
207
+
208
+ ### Grid Classes
209
+ The grid system uses a series of containers, rows, and columns to layout and align content.
210
+
211
+ #### Container Classes
212
+ - `.container`: Fixed-width container
213
+ - `.container-fluid`: Full-width container
214
+ - `.container-{breakpoint}`: Responsive container
215
+
216
+ #### Row Classes
217
+ - `.row`: Creates a grid row
218
+ - `.row-cols-*`: Set number of columns
219
+
220
+ #### Column Classes
221
+ - `.col`: Equal-width columns
222
+ - `.col-{breakpoint}`: Responsive columns
223
+ - `.col-{number}`: Sized columns (1-12)
224
+ - `.col-{breakpoint}-{number}`: Responsive sized columns
225
+
226
+ ### Responsive Breakpoints
227
+ - xs: <576px
228
+ - sm: ≥576px
229
+ - md: ≥768px
230
+ - lg: ≥992px
231
+ - xl: ≥1200px
232
+ - xxl: ≥1400px
233
+
234
+ ### Example Usage
235
+ ```html
236
+ <div class="container">
237
+ <div class="row">
238
+ <div class="col-md-4">Column 1</div>
239
+ <div class="col-md-4">Column 2</div>
240
+ <div class="col-md-4">Column 3</div>
241
+ </div>
242
+ </div>
243
+ ```
244
+
245
+ ### Grid Gutters
246
+ - `.g-*`: Gutter spacing (0-5)
247
+ - `.gx-*`: Horizontal gutters
248
+ - `.gy-*`: Vertical gutters
249
+
250
+ ## NEW in v5.3: Utility Classes
251
+
252
+ ### Extended Color Utilities
253
+ **ADDITION**: New color utility classes for more granular control:
254
+ - `.text-primary-emphasis`
255
+ - `.text-secondary-emphasis`
256
+ - `.bg-primary-subtle`
257
+ - `.bg-secondary-subtle`
258
+ - `.border-primary-subtle`
259
+
260
+ ### Extended Spacing Utilities
261
+ **ADDITION**: New spacing utilities:
262
+ - `.p-*`: Padding (now includes half-step increments)
263
+ - `.m-*`: Margin (now includes half-step increments)
264
+ - Example: `.p-2-5` for padding of 0.625rem
265
+
266
+ ### Focus Ring Utilities
267
+ **NEW FEATURE**: Custom focus ring utilities:
268
+ - `.focus-ring`
269
+ - `.focus-ring-{color}`
270
+ - Provides accessible focus indicators
271
+
272
+ ### Link Utilities
273
+ **IMPROVEMENT**: Enhanced link utilities:
274
+ - `.link-opacity-*`: Control link opacity (10-100)
275
+ - `.link-underline-opacity-*`: Control underline opacity
276
+ - Better accessibility for link states
277
+
278
+ ### Example New Utilities
279
+ ```html
280
+ <div class="bg-primary-subtle p-2-5">
281
+ <a href="#" class="link-opacity-75">Accessible Link</a>
282
+ </div>
283
+ ```
284
+
285
+ Version: v5.3
286
+ Released: 2023-05
287
+ Changes from v5.2: Added emphasis colors, extended spacing, focus ring utilities, enhanced link controls
288
+ """,
289
+
290
+ # Spark - 2 versions
291
+ "spark_v3.0.txt": """# Apache Spark v3.0 Documentation
292
+
293
+ ## DataFrame API
294
+
295
+ DataFrames are distributed collections of data organized into named columns, conceptually equivalent to tables in relational databases.
296
+
297
+ ### Creating DataFrames
298
+
299
+ ```python
300
+ from pyspark.sql import SparkSession
301
+
302
+ spark = SparkSession.builder.appName("example").getOrCreate()
303
+
304
+ # From list
305
+ df = spark.createDataFrame([(1, "John"), (2, "Jane")], ["id", "name"])
306
+
307
+ # From RDD
308
+ rdd = spark.sparkContext.parallelize([(1, "John"), (2, "Jane")])
309
+ df = spark.createDataFrame(rdd, ["id", "name"])
310
+
311
+ # From file
312
+ df = spark.read.csv("data.csv", header=True, inferSchema=True)
313
+ ```
314
+
315
+ ### DataFrame Operations
316
+
317
+ #### Select
318
+ ```python
319
+ df.select("name").show()
320
+ df.select(df["name"], df["id"] + 1).show()
321
+ ```
322
+
323
+ #### Filter
324
+ ```python
325
+ df.filter(df["id"] > 1).show()
326
+ df.where(df["name"] == "John").show()
327
+ ```
328
+
329
+ #### GroupBy
330
+ ```python
331
+ df.groupBy("name").count().show()
332
+ df.groupBy("department").agg({"salary": "avg"}).show()
333
+ ```
334
+
335
+ #### Join
336
+ ```python
337
+ df1.join(df2, df1["id"] == df2["id"], "inner").show()
338
+ ```
339
+
340
+ ### Schema Definition
341
+ ```python
342
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType
343
+
344
+ schema = StructType([
345
+ StructField("id", IntegerType(), True),
346
+ StructField("name", StringType(), True)
347
+ ])
348
+
349
+ df = spark.createDataFrame(data, schema)
350
+ ```
351
+
352
+ ### Data Types
353
+ - IntegerType, LongType, FloatType, DoubleType
354
+ - StringType, BinaryType
355
+ - BooleanType
356
+ - DateType, TimestampType
357
+ - ArrayType, MapType, StructType
358
+
359
+ Version: v3.0
360
+ Released: 2020-06
361
+ """,
362
+
363
+ "spark_v3.5.txt": """# Apache Spark v3.5 Documentation
364
+
365
+ ## DataFrame API
366
+
367
+ DataFrames are distributed collections of data organized into named columns, conceptually equivalent to tables in relational databases.
368
+
369
+ ### Creating DataFrames
370
+
371
+ ```python
372
+ from pyspark.sql import SparkSession
373
+
374
+ spark = SparkSession.builder.appName("example").getOrCreate()
375
+
376
+ # From list
377
+ df = spark.createDataFrame([(1, "John"), (2, "Jane")], ["id", "name"])
378
+
379
+ # From file (improved in v3.5)
380
+ df = spark.read.csv("data.csv", header=True, inferSchema=True)
381
+ df = spark.read.json("data.json")
382
+ df = spark.read.parquet("data.parquet")
383
+ ```
384
+
385
+ ### DataFrame Operations
386
+
387
+ #### Select
388
+ ```python
389
+ df.select("name").show()
390
+ df.select(df["name"], df["id"] + 1).show()
391
+ ```
392
+
393
+ #### Filter
394
+ ```python
395
+ df.filter(df["id"] > 1).show()
396
+ df.where(df["name"] == "John").show()
397
+ ```
398
+
399
+ #### GroupBy
400
+ ```python
401
+ df.groupBy("name").count().show()
402
+ df.groupBy("department").agg({"salary": "avg"}).show()
403
+ ```
404
+
405
+ #### Join (Enhanced in v3.5)
406
+ ```python
407
+ # New: Support for multiple join types
408
+ df1.join(df2, df1["id"] == df2["id"], "inner").show()
409
+ df1.join(df2, "id", "left_outer").show() # Simplified syntax
410
+ ```
411
+
412
+ ### Schema Definition
413
+ ```python
414
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType
415
+
416
+ schema = StructType([
417
+ StructField("id", IntegerType(), True),
418
+ StructField("name", StringType(), True)
419
+ ])
420
+
421
+ df = spark.createDataFrame(data, schema)
422
+ ```
423
+
424
+ ### Data Types
425
+ - IntegerType, LongType, FloatType, DoubleType
426
+ - StringType, BinaryType
427
+ - BooleanType
428
+ - DateType, TimestampType
429
+ - ArrayType, MapType, StructType
430
+
431
+ ## REMOVED in v3.5
432
+
433
+ **DEPRECATED APIs REMOVED**:
434
+ - `DataFrame.inferSchema()` - Use `spark.read` with `inferSchema=True` instead
435
+ - `SQLContext` - Use `SparkSession` instead
436
+ - Legacy `RDD.toDF()` without schema - Now requires explicit schema
437
+ - Old Window functions syntax - Use new SQL standard syntax
438
+
439
+ **Breaking Changes**:
440
+ - Python 2 support removed
441
+ - Scala 2.11 support removed
442
+ - Legacy Hive metastore APIs removed
443
+
444
+ ## NEW in v3.5
445
+
446
+ **Performance Improvements**:
447
+ - Adaptive Query Execution (AQE) enabled by default
448
+ - Dynamic partition pruning enhancements
449
+ - Better join reordering
450
+
451
+ **New Features**:
452
+ - Built-in ML preprocessing functions
453
+ - Enhanced error messages with suggestions
454
+ - Better compatibility with Pandas 2.0
455
+
456
+ Version: v3.5
457
+ Released: 2023-09
458
+ Major Changes: Removed deprecated APIs, improved performance, Python 2 support dropped
459
+ """,
460
+
461
+ # Healthcare
462
+ "clinical_guidelines_v1.0.txt": """# Clinical Treatment Guidelines v1.0
463
+
464
+ ## Introduction
465
+ These guidelines provide evidence-based recommendations for patient care and treatment protocols.
466
+
467
+ ## General Treatment Protocols
468
+
469
+ ### Patient Assessment
470
+ 1. Initial examination and history taking
471
+ 2. Vital signs measurement
472
+ 3. Physical examination
473
+ 4. Laboratory tests as indicated
474
+ 5. Diagnostic imaging when necessary
475
+
476
+ ### Medication Administration
477
+ - Follow five rights: right patient, right drug, right dose, right route, right time
478
+ - Document all medications given
479
+ - Monitor for adverse reactions
480
+ - Patient education on medication use
481
+
482
+ ### Infection Control
483
+ - Standard precautions for all patients
484
+ - Hand hygiene before and after patient contact
485
+ - Use of personal protective equipment (PPE)
486
+ - Proper disposal of medical waste
487
+ - Environmental cleaning protocols
488
+
489
+ ### Pain Management
490
+ - Assess pain using standardized scales (0-10)
491
+ - Non-pharmacological interventions first
492
+ - Pharmacological options when indicated
493
+ - Regular reassessment and documentation
494
+ - Patient-controlled analgesia when appropriate
495
+
496
+ ### Common Conditions
497
+
498
+ #### Hypertension
499
+ - Target BP: <140/90 mmHg
500
+ - First-line: ACE inhibitors or thiazide diuretics
501
+ - Lifestyle modifications: diet, exercise, stress reduction
502
+ - Regular monitoring and follow-up
503
+
504
+ #### Diabetes Management
505
+ - Target HbA1c: <7%
506
+ - Blood glucose monitoring
507
+ - Insulin or oral hypoglycemics as indicated
508
+ - Dietary counseling
509
+ - Regular foot examinations
510
+
511
+ #### Respiratory Infections
512
+ - Symptomatic treatment
513
+ - Antibiotics only for bacterial infections
514
+ - Rest and hydration
515
+ - Isolation precautions if necessary
516
+
517
+ ### Documentation Requirements
518
+ - All interventions must be documented
519
+ - Adverse events reported immediately
520
+ - Patient progress notes daily
521
+ - Discharge planning initiated early
522
+
523
+ Version: v1.0
524
+ Effective Date: January 2023
525
+ """,
526
+
527
+ "clinical_guidelines_v2.0.txt": """# Clinical Treatment Guidelines v2.0
528
+
529
+ ## Introduction
530
+ These guidelines provide evidence-based recommendations for patient care and treatment protocols.
531
+
532
+ **UPDATED for v2.0**: Incorporates latest research findings and new treatment modalities.
533
+
534
+ ## General Treatment Protocols
535
+
536
+ ### Patient Assessment
537
+ 1. Initial examination and history taking
538
+ 2. Vital signs measurement (now includes SpO2 monitoring)
539
+ 3. Physical examination
540
+ 4. Laboratory tests as indicated
541
+ 5. Diagnostic imaging when necessary
542
+ 6. **NEW**: Risk stratification scoring
543
+
544
+ ### Medication Administration
545
+ - Follow five rights: right patient, right drug, right dose, right route, right time
546
+ - Document all medications given
547
+ - Monitor for adverse reactions
548
+ - Patient education on medication use
549
+ - **NEW**: Electronic verification system required
550
+ - **NEW**: Double-check protocol for high-risk medications
551
+
552
+ ### Infection Control
553
+ - Standard precautions for all patients
554
+ - Hand hygiene before and after patient contact
555
+ - Use of personal protective equipment (PPE)
556
+ - Proper disposal of medical waste
557
+ - Environmental cleaning protocols
558
+ - **NEW**: Enhanced protocols for multi-drug resistant organisms
559
+ - **NEW**: Mandatory staff screening during outbreaks
560
+
561
+ ### Pain Management
562
+ - Assess pain using standardized scales (0-10)
563
+ - Non-pharmacological interventions first
564
+ - Pharmacological options when indicated
565
+ - Regular reassessment and documentation
566
+ - Patient-controlled analgesia when appropriate
567
+ - **NEW**: Multimodal analgesia approach preferred
568
+ - **NEW**: Reduced opioid prescribing guidelines
569
+
570
+ ### Common Conditions
571
+
572
+ #### Hypertension (UPDATED)
573
+ - **NEW Target BP: <130/80 mmHg** (lowered from 140/90)
574
+ - First-line: ACE inhibitors or thiazide diuretics
575
+ - **NEW**: Consider combination therapy for BP >140/90
576
+ - Lifestyle modifications: diet, exercise, stress reduction
577
+ - Regular monitoring and follow-up
578
+ - **NEW**: Home blood pressure monitoring encouraged
579
+
580
+ #### Diabetes Management (UPDATED)
581
+ - Target HbA1c: <7% (individualized for elderly: <8%)
582
+ - Blood glucose monitoring
583
+ - **NEW**: GLP-1 agonists as first-line for cardiovascular benefit
584
+ - Insulin or oral hypoglycemics as indicated
585
+ - Dietary counseling with registered dietitian
586
+ - Regular foot examinations
587
+ - **NEW**: Annual retinal screening mandatory
588
+ - **NEW**: Cardiovascular risk assessment required
589
+
590
+ #### Respiratory Infections
591
+ - Symptomatic treatment
592
+ - Antibiotics only for bacterial infections
593
+ - Rest and hydration
594
+ - Isolation precautions if necessary
595
+ - **NEW**: Rapid PCR testing for influenza and COVID-19
596
+ - **NEW**: Updated isolation protocols
597
+
598
+ ### NEW SECTION: Telemedicine Protocols
599
+ - Video visit guidelines
600
+ - Remote monitoring for chronic conditions
601
+ - Digital prescription protocols
602
+ - Documentation requirements for virtual care
603
+
604
+ ### Documentation Requirements
605
+ - All interventions must be documented in EHR
606
+ - Adverse events reported immediately (within 24 hours)
607
+ - Patient progress notes daily
608
+ - Discharge planning initiated within 24 hours
609
+ - **NEW**: Quality metrics tracking required
610
+ - **NEW**: Patient satisfaction surveys
611
+
612
+ Version: v2.0
613
+ Effective Date: January 2024
614
+ Major Changes: Updated BP targets, new diabetes medications, enhanced infection control, telemedicine added
615
+ """,
616
+
617
+ # Finance
618
+ "compliance_fy2023.txt": """# Financial Compliance Report FY2023
619
+
620
+ ## Regulatory Overview
621
+ This document outlines the compliance requirements for financial reporting and operations for Fiscal Year 2023.
622
+
623
+ ## Key Regulations
624
+
625
+ ### SOX Compliance (Sarbanes-Oxley Act)
626
+ - Section 302: CEO/CFO certification of financial statements
627
+ - Section 404: Internal control assessment
628
+ - Section 409: Real-time disclosure of material changes
629
+ - Annual external audit required
630
+ - Quarterly internal control testing
631
+
632
+ ### Anti-Money Laundering (AML)
633
+ - Customer due diligence (CDD) required
634
+ - Transaction monitoring systems operational
635
+ - Suspicious Activity Reports (SARs) filed when appropriate
636
+ - Employee training completed annually
637
+ - Independent testing of AML program
638
+
639
+ ### Know Your Customer (KYC)
640
+ - Identity verification for all new customers
641
+ - Beneficial ownership identification
642
+ - Enhanced due diligence for high-risk customers
643
+ - Ongoing monitoring and updates
644
+ - Documentation retention for 5 years
645
+
646
+ ### Data Privacy
647
+ - GDPR compliance for EU customers
648
+ - CCPA compliance for California residents
649
+ - Data encryption at rest and in transit
650
+ - Access controls and authentication
651
+ - Breach notification procedures
652
+
653
+ ## Reporting Requirements
654
+
655
+ ### Financial Statements
656
+ - Quarterly 10-Q filings
657
+ - Annual 10-K filing
658
+ - Earnings releases
659
+ - Management Discussion & Analysis (MD&A)
660
+ - Audited financial statements
661
+
662
+ ### Regulatory Filings
663
+ - Form 13F for institutional investment managers
664
+ - Form 4 for insider transactions
665
+ - Schedule 13D/G for beneficial ownership
666
+ - Form 8-K for material events
667
+
668
+ ### Internal Reports
669
+ - Monthly management reports
670
+ - Quarterly compliance certifications
671
+ - Annual risk assessments
672
+ - Internal audit findings
673
+ - Board committee reports
674
+
675
+ ## Risk Management
676
+
677
+ ### Operational Risk
678
+ - Business continuity planning
679
+ - Disaster recovery testing
680
+ - Vendor management oversight
681
+ - Cybersecurity assessments
682
+ - Insurance coverage review
683
+
684
+ ### Market Risk
685
+ - Value at Risk (VaR) calculations
686
+ - Stress testing scenarios
687
+ - Concentration limits
688
+ - Hedging strategies
689
+ - Daily position monitoring
690
+
691
+ ### Credit Risk
692
+ - Credit rating assessments
693
+ - Exposure limits by counterparty
694
+ - Collateral management
695
+ - Provision for loan losses
696
+ - Portfolio diversification
697
+
698
+ ## Compliance Metrics FY2023
699
+ - Total regulatory filings: 48
700
+ - Internal audits conducted: 12
701
+ - Compliance training completion: 98%
702
+ - Zero material violations
703
+ - External audit: Clean opinion
704
+
705
+ Fiscal Year: 2023
706
+ Report Date: December 2023
707
+ """,
708
+
709
+ "compliance_fy2024.txt": """# Financial Compliance Report FY2024
710
+
711
+ ## Regulatory Overview
712
+ This document outlines the compliance requirements for financial reporting and operations for Fiscal Year 2024.
713
+
714
+ **MAJOR UPDATES for FY2024**: New SEC rules, enhanced cybersecurity requirements, and ESG disclosures.
715
+
716
+ ## Key Regulations
717
+
718
+ ### SOX Compliance (Sarbanes-Oxley Act)
719
+ - Section 302: CEO/CFO certification of financial statements
720
+ - Section 404: Internal control assessment
721
+ - Section 409: Real-time disclosure of material changes
722
+ - Annual external audit required
723
+ - Quarterly internal control testing
724
+ - **NEW**: Enhanced documentation requirements
725
+
726
+ ### Anti-Money Laundering (AML)
727
+ - Customer due diligence (CDD) required
728
+ - Transaction monitoring systems operational
729
+ - Suspicious Activity Reports (SARs) filed when appropriate
730
+ - Employee training completed annually
731
+ - Independent testing of AML program
732
+ - **NEW**: Real-time transaction monitoring enhanced
733
+ - **NEW**: Cryptocurrency transaction monitoring added
734
+
735
+ ### Know Your Customer (KYC)
736
+ - Identity verification for all new customers
737
+ - Beneficial ownership identification
738
+ - Enhanced due diligence for high-risk customers
739
+ - Ongoing monitoring and updates
740
+ - Documentation retention for 5 years
741
+ - **NEW**: Biometric verification for high-risk accounts
742
+ - **NEW**: Automated screening against sanctions lists
743
+
744
+ ### Data Privacy (UPDATED)
745
+ - GDPR compliance for EU customers
746
+ - CCPA compliance for California residents
747
+ - **NEW**: CPRA (California Privacy Rights Act) requirements
748
+ - Data encryption at rest and in transit
749
+ - Access controls and multi-factor authentication
750
+ - Breach notification procedures
751
+ - **NEW**: Data mapping and inventory required
752
+ - **NEW**: Privacy impact assessments for new systems
753
+
754
+ ### NEW: Cybersecurity Disclosure Rules
755
+ - **MAJOR ADDITION**: SEC cybersecurity disclosure requirements
756
+ - Material cybersecurity incidents reported within 4 days
757
+ - Annual cybersecurity governance disclosure
758
+ - Board oversight of cybersecurity risk
759
+ - Incident response plan documented and tested
760
+
761
+ ### NEW: ESG Disclosure Requirements
762
+ - **MAJOR ADDITION**: Climate-related disclosure rules
763
+ - Scope 1 and 2 emissions reporting
764
+ - Material climate risks identified
765
+ - Board oversight of climate risks
766
+ - Third-party assurance of emissions data
767
+
768
+ ## Reporting Requirements
769
+
770
+ ### Financial Statements
771
+ - Quarterly 10-Q filings
772
+ - Annual 10-K filing
773
+ - Earnings releases
774
+ - Management Discussion & Analysis (MD&A)
775
+ - Audited financial statements
776
+ - **NEW**: Inline XBRL tagging required
777
+
778
+ ### Regulatory Filings
779
+ - Form 13F for institutional investment managers
780
+ - Form 4 for insider transactions
781
+ - Schedule 13D/G for beneficial ownership
782
+ - Form 8-K for material events
783
+ - **NEW**: Form 8-K for cybersecurity incidents
784
+ - **NEW**: Climate disclosure forms
785
+
786
+ ### Internal Reports
787
+ - Monthly management reports
788
+ - Quarterly compliance certifications
789
+ - Annual risk assessments
790
+ - Internal audit findings
791
+ - Board committee reports
792
+ - **NEW**: Monthly cybersecurity dashboards
793
+ - **NEW**: Quarterly ESG metrics
794
+
795
+ ## Risk Management
796
+
797
+ ### Operational Risk
798
+ - Business continuity planning
799
+ - Disaster recovery testing (now quarterly)
800
+ - Vendor management oversight with annual reviews
801
+ - **NEW**: Third-party risk assessment enhanced
802
+ - Cybersecurity assessments (now monthly)
803
+ - Insurance coverage review
804
+ - **NEW**: Ransomware response protocols
805
+
806
+ ### Market Risk
807
+ - Value at Risk (VaR) calculations
808
+ - Stress testing scenarios (now includes crypto)
809
+ - Concentration limits
810
+ - Hedging strategies
811
+ - Daily position monitoring
812
+ - **NEW**: Climate scenario analysis
813
+
814
+ ### Credit Risk
815
+ - Credit rating assessments
816
+ - Exposure limits by counterparty
817
+ - Collateral management
818
+ - Provision for loan losses (CECL methodology)
819
+ - Portfolio diversification
820
+ - **NEW**: ESG factors in credit analysis
821
+
822
+ ### NEW: Cybersecurity Risk
823
+ - Penetration testing quarterly
824
+ - Vulnerability assessments monthly
825
+ - Security awareness training for all employees
826
+ - Incident response plan tested annually
827
+ - 24/7 security operations center
828
+ - Zero-trust architecture implementation
829
+
830
+ ## Compliance Metrics FY2024
831
+ - Total regulatory filings: 56 (↑17% from FY2023)
832
+ - Internal audits conducted: 16 (↑33%)
833
+ - Compliance training completion: 99.5%
834
+ - Zero material violations
835
+ - External audit: Clean opinion
836
+ - **NEW**: Cybersecurity incidents reported: 0
837
+ - **NEW**: ESG disclosure score: A-
838
+
839
+ Fiscal Year: 2024
840
+ Report Date: December 2024
841
+ Major Changes: New SEC cybersecurity rules, ESG disclosures added, enhanced AML monitoring, CPRA compliance
842
+ """,
843
+
844
+ # Industrial
845
+ "machine_operation_rev1.0.txt": """# Industrial Machine Operation Manual - Rev. 1.0
846
+
847
+ ## Equipment Overview
848
+ High-precision CNC milling machine for metal fabrication operations.
849
+
850
+ Model: IMM-5000
851
+ Serial Number: [Unit Specific]
852
+ Manufacturer: Industrial Machines Inc.
853
+
854
+ ## Safety Requirements
855
+
856
+ ### Personal Protective Equipment (PPE)
857
+ - Safety glasses with side shields (ANSI Z87.1)
858
+ - Steel-toed safety boots
859
+ - Hearing protection (>85 dB areas)
860
+ - Machine operator gloves
861
+ - No loose clothing or jewelry
862
+
863
+ ### Machine Safety Features
864
+ - Emergency stop button (red mushroom head)
865
+ - Safety interlocks on all access doors
866
+ - Light curtain protection system
867
+ - Audible alarm before operation
868
+ - Fire suppression system
869
+
870
+ ## Startup Procedure
871
+
872
+ ### Pre-Startup Checks
873
+ 1. Inspect machine for visible damage or wear
874
+ 2. Check all safety guards are in place
875
+ 3. Verify emergency stop functions properly
876
+ 4. Ensure work area is clean and clear
877
+ 5. Check coolant levels (minimum 80%)
878
+ 6. Inspect cutting tools for wear or damage
879
+ 7. Verify power supply voltage (480V 3-phase)
880
+
881
+ ### Startup Sequence
882
+ 1. Turn main power switch to ON position
883
+ 2. Wait for hydraulic system to pressurize (indicator light)
884
+ 3. Initialize machine control system (press INIT button)
885
+ 4. Perform axis homing sequence (X, Y, Z axes)
886
+ 5. Load machining program into controller
887
+ 6. Verify tool offset data
888
+ 7. Perform dry run without material
889
+ 8. Load workpiece and secure in fixture
890
+ 9. Set spindle speed and feed rate
891
+ 10. Begin machining operation
892
+
893
+ ## Operation
894
+
895
+ ### Standard Operating Parameters
896
+ - Spindle speed range: 100-6000 RPM
897
+ - Feed rate: 1-500 inches per minute
898
+ - Maximum workpiece weight: 2000 lbs
899
+ - Coolant flow rate: 10 GPM
900
+ - Operating temperature: 60-90°F
901
+
902
+ ### Control Panel Functions
903
+ - CYCLE START: Begins programmed operation
904
+ - CYCLE STOP: Pauses operation
905
+ - EMERGENCY STOP: Immediate shutdown
906
+ - FEED HOLD: Temporarily pauses feed motion
907
+ - JOG: Manual axis movement
908
+ - SPINDLE OVERRIDE: Adjust spindle speed (50-150%)
909
+
910
+ ### Monitoring During Operation
911
+ - Watch for unusual vibrations
912
+ - Listen for abnormal sounds
913
+ - Monitor coolant flow
914
+ - Check chip evacuation
915
+ - Verify dimensional accuracy periodically
916
+ - Monitor cutting tool wear
917
+
918
+ ## Shutdown Procedure
919
+
920
+ 1. Complete current machining cycle
921
+ 2. Press CYCLE STOP button
922
+ 3. Return spindle to home position
923
+ 4. Stop spindle rotation
924
+ 5. Turn off coolant system
925
+ 6. Remove workpiece
926
+ 7. Clean machine surfaces and work area
927
+ 8. Shut down control system
928
+ 9. Turn off main power switch
929
+ 10. Complete operator log entry
930
+
931
+ ## Maintenance Schedule
932
+
933
+ ### Daily
934
+ - Clean machine surfaces
935
+ - Check coolant level and condition
936
+ - Inspect cutting tools
937
+ - Verify all safety features
938
+ - Lubricate way surfaces
939
+
940
+ ### Weekly
941
+ - Check hydraulic fluid level
942
+ - Inspect electrical connections
943
+ - Test emergency stop function
944
+ - Clean coolant tank filter
945
+
946
+ ### Monthly
947
+ - Full machine cleaning
948
+ - Lubrication of all grease points
949
+ - Check belt tensions
950
+ - Calibrate tools
951
+ - Inspect safety guards
952
+
953
+ ### Annual
954
+ - Professional maintenance service
955
+ - Complete electrical inspection
956
+ - Hydraulic system service
957
+ - Accuracy verification
958
+ - Safety system certification
959
+
960
+ ## Troubleshooting
961
+
962
+ ### Machine Won't Start
963
+ - Check main power supply
964
+ - Verify emergency stop is reset
965
+ - Check for blown fuses
966
+ - Inspect door interlocks
967
+
968
+ ### Poor Surface Finish
969
+ - Check cutting tool condition
970
+ - Verify proper speeds and feeds
971
+ - Check machine rigidity
972
+ - Inspect coolant flow
973
+
974
+ ### Dimensional Inaccuracy
975
+ - Verify tool offsets
976
+ - Check for thermal growth
977
+ - Inspect ball screws
978
+ - Verify workpiece fixturing
979
+
980
+ Revision: 1.0
981
+ Date: January 2023
982
+ """,
983
+
984
+ "machine_operation_rev2.0.txt": """# Industrial Machine Operation Manual - Rev. 2.0
985
+
986
+ ## Equipment Overview
987
+ High-precision CNC milling machine for metal fabrication operations.
988
+
989
+ Model: IMM-5000
990
+ Serial Number: [Unit Specific]
991
+ Manufacturer: Industrial Machines Inc.
992
+
993
+ **UPDATED Rev. 2.0**: Enhanced safety features, automated monitoring, and improved procedures.
994
+
995
+ ## Safety Requirements
996
+
997
+ ### Personal Protective Equipment (PPE)
998
+ - Safety glasses with side shields (ANSI Z87.1)
999
+ - Steel-toed safety boots
1000
+ - Hearing protection (>85 dB areas)
1001
+ - Machine operator gloves
1002
+ - No loose clothing or jewelry
1003
+ - **NEW**: Cut-resistant sleeves for tool changing
1004
+
1005
+ ### Machine Safety Features
1006
+ - Emergency stop button (red mushroom head)
1007
+ - Safety interlocks on all access doors
1008
+ - Light curtain protection system
1009
+ - Audible alarm before operation
1010
+ - Fire suppression system
1011
+ - **NEW**: Automatic door locking during operation
1012
+ - **NEW**: Collision detection system
1013
+ - **NEW**: Automatic power-off on anomaly detection
1014
+ - **NEW**: Video monitoring system
1015
+ - **NEW**: Operator presence detection
1016
+
1017
+ ### NEW: Enhanced Safety Protocols
1018
+ - **ADDITION**: Two-person operation required for large workpieces
1019
+ - **ADDITION**: Mandatory safety briefing before first daily use
1020
+ - **ADDITION**: Personal lockout/tagout procedures
1021
+ - **ADDITION**: Near-miss reporting system
1022
+ - **ADDITION**: Monthly safety drills
1023
+
1024
+ ## Startup Procedure
1025
+
1026
+ ### Pre-Startup Checks
1027
+ 1. Inspect machine for visible damage or wear
1028
+ 2. Check all safety guards are in place
1029
+ 3. Verify emergency stop functions properly
1030
+ 4. Ensure work area is clean and clear (5S standards)
1031
+ 5. Check coolant levels (minimum 80%)
1032
+ 6. Inspect cutting tools for wear or damage
1033
+ 7. Verify power supply voltage (480V 3-phase)
1034
+ 8. **NEW**: Complete digital pre-start checklist on HMI
1035
+ 9. **NEW**: Verify backup systems operational
1036
+ 10. **NEW**: Check air pressure (90 PSI minimum)
1037
+
1038
+ ### Startup Sequence
1039
+ 1. Turn main power switch to ON position
1040
+ 2. Wait for hydraulic system to pressurize (indicator light)
1041
+ 3. Initialize machine control system (press INIT button)
1042
+ 4. Perform axis homing sequence (X, Y, Z axes)
1043
+ 5. **NEW**: System automatically runs diagnostics
1044
+ 6. Load machining program into controller
1045
+ 7. Verify tool offset data
1046
+ 8. **NEW**: Automatic tool measurement cycle
1047
+ 9. Perform dry run without material
1048
+ 10. Load workpiece and secure in fixture
1049
+ 11. **NEW**: Scan operator badge for authorization
1050
+ 12. Set spindle speed and feed rate
1051
+ 13. **NEW**: System verifies parameters within safe limits
1052
+ 14. Begin machining operation
1053
+
1054
+ ## Operation
1055
+
1056
+ ### Standard Operating Parameters
1057
+ - Spindle speed range: 100-8000 RPM (↑ from 6000)
1058
+ - Feed rate: 1-500 inches per minute
1059
+ - Maximum workpiece weight: 2000 lbs
1060
+ - Coolant flow rate: 10 GPM
1061
+ - Operating temperature: 60-90°F
1062
+ - **NEW**: Automatic parameter optimization based on material
1063
+ - **NEW**: Real-time monitoring and adjustment
1064
+
1065
+ ### Control Panel Functions
1066
+ - CYCLE START: Begins programmed operation
1067
+ - CYCLE STOP: Pauses operation
1068
+ - EMERGENCY STOP: Immediate shutdown
1069
+ - FEED HOLD: Temporarily pauses feed motion
1070
+ - JOG: Manual axis movement
1071
+ - SPINDLE OVERRIDE: Adjust spindle speed (50-150%)
1072
+ - **NEW**: ADAPTIVE CONTROL: Auto-optimizes feeds/speeds
1073
+ - **NEW**: REMOTE MONITORING: View status on mobile app
1074
+
1075
+ ### Monitoring During Operation
1076
+ - Watch for unusual vibrations
1077
+ - Listen for abnormal sounds
1078
+ - Monitor coolant flow
1079
+ - Check chip evacuation
1080
+ - Verify dimensional accuracy periodically
1081
+ - Monitor cutting tool wear
1082
+ - **NEW**: Automated vibration monitoring alerts operator
1083
+ - **NEW**: Tool wear prediction system
1084
+ - **NEW**: Automatic quality checks every 10 parts
1085
+ - **NEW**: Energy consumption tracking
1086
+
1087
+ ### NEW: Automated Features
1088
+ - Automatic tool changer with 40-tool capacity
1089
+ - In-cycle tool measurement
1090
+ - Adaptive feed control
1091
+ - Predictive maintenance alerts
1092
+ - Remote diagnostics capability
1093
+ - Automatic program backup
1094
+ - Production counter with yield tracking
1095
+
1096
+ ## Shutdown Procedure
1097
+
1098
+ 1. Complete current machining cycle
1099
+ 2. Press CYCLE STOP button
1100
+ 3. **NEW**: Allow automatic cooldown cycle (2 minutes)
1101
+ 4. Return spindle to home position
1102
+ 5. Stop spindle rotation
1103
+ 6. Turn off coolant system
1104
+ 7. **NEW**: System automatically drains coolant from spindle
1105
+ 8. Remove workpiece
1106
+ 9. Clean machine surfaces and work area
1107
+ 10. **NEW**: Complete digital operator log on HMI
1108
+ 11. Shut down control system
1109
+ 12. Turn off main power switch
1110
+ 13. **NEW**: System generates daily production report
1111
+
1112
+ ## Maintenance Schedule
1113
+
1114
+ ### Daily
1115
+ - Clean machine surfaces
1116
+ - Check coolant level and condition
1117
+ - Inspect cutting tools
1118
+ - Verify all safety features
1119
+ - Lubricate way surfaces
1120
+ - **NEW**: Review automated diagnostic report
1121
+ - **NEW**: Check chip conveyor operation
1122
+
1123
+ ### Weekly
1124
+ - Check hydraulic fluid level
1125
+ - Inspect electrical connections
1126
+ - Test emergency stop function
1127
+ - Clean coolant tank filter
1128
+ - **NEW**: Review vibration analysis data
1129
+ - **NEW**: Update tool life database
1130
+
1131
+ ### Monthly
1132
+ - Full machine cleaning
1133
+ - Lubrication of all grease points
1134
+ - Check belt tensions
1135
+ - Calibrate tools
1136
+ - Inspect safety guards
1137
+ - **NEW**: Thermal imaging inspection
1138
+ - **NEW**: Backup all programs and parameters
1139
+
1140
+ ### Quarterly (NEW)
1141
+ - Professional calibration service
1142
+ - Update control software
1143
+ - Test all safety interlocks
1144
+ - Inspect for wear on critical components
1145
+ - Review maintenance logs
1146
+
1147
+ ### Annual
1148
+ - Professional maintenance service
1149
+ - Complete electrical inspection
1150
+ - Hydraulic system service
1151
+ - Accuracy verification (laser interferometer)
1152
+ - Safety system certification
1153
+ - **NEW**: Complete machine recalibration
1154
+ - **NEW**: Operator retraining and certification
1155
+
1156
+ ## Troubleshooting
1157
+
1158
+ ### Machine Won't Start
1159
+ - Check main power supply
1160
+ - Verify emergency stop is reset
1161
+ - Check for blown fuses
1162
+ - Inspect door interlocks
1163
+ - **NEW**: Review diagnostic error codes on HMI
1164
+ - **NEW**: Check operator authorization
1165
+
1166
+ ### Poor Surface Finish
1167
+ - Check cutting tool condition
1168
+ - Verify proper speeds and feeds
1169
+ - Check machine rigidity
1170
+ - Inspect coolant flow
1171
+ - **NEW**: Review vibration monitoring data
1172
+ - **NEW**: Check automatic compensation settings
1173
+
1174
+ ### Dimensional Inaccuracy
1175
+ - Verify tool offsets
1176
+ - Check for thermal growth
1177
+ - Inspect ball screws
1178
+ - Verify workpiece fixturing
1179
+ - **NEW**: Run automatic calibration routine
1180
+ - **NEW**: Check environmental temperature
1181
+
1182
+ ### NEW: Automated Diagnostics
1183
+ - System automatically logs errors
1184
+ - Predictive maintenance alerts
1185
+ - Remote support connection available
1186
+ - QR codes for instant technical manual access
1187
+ - Video troubleshooting guides on HMI
1188
+
1189
+ ## NEW SECTION: Industry 4.0 Integration
1190
+ - IoT connectivity for production monitoring
1191
+ - Integration with MES (Manufacturing Execution System)
1192
+ - Real-time OEE (Overall Equipment Effectiveness) tracking
1193
+ - Automatic inventory management of tools and consumables
1194
+ - Predictive maintenance using machine learning
1195
+ - Digital twin simulation capability
1196
+
1197
+ Revision: 2.0
1198
+ Date: January 2024
1199
+ Major Changes: Enhanced safety features (collision detection, presence sensors), automated monitoring, predictive maintenance, Industry 4.0 connectivity, increased spindle speed range
1200
+ """
1201
+ }
1202
+
1203
def create_dataset():
    """Create the sample dataset directory and write all SAMPLE_DOCS files.

    Writes every (filename, content) pair from the module-level SAMPLE_DOCS
    dict into the ``sample_data`` directory (created if missing), then prints
    a summary of the dataset distribution.
    """
    dataset_dir = "sample_data"
    os.makedirs(dataset_dir, exist_ok=True)

    print(f"Creating sample dataset in '{dataset_dir}' directory...")

    for filename, content in SAMPLE_DOCS.items():
        filepath = os.path.join(dataset_dir, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        # Bug fix: the original printed the literal placeholder "(unknown)"
        # instead of the name of the file that was just written.
        print(f"✓ Created {filename}")

    print(f"\n✅ Successfully created {len(SAMPLE_DOCS)} sample documents!")
    print(f"\nDataset distribution:")
    print("- Software (Node.js): 3 files")
    print("- Software (Bootstrap): 2 files")
    print("- Software (Spark): 2 files")
    print("- Healthcare: 2 files")
    print("- Finance: 2 files")
    print("- Industrial: 2 files")
    print("=" * 50)
    print("Total: 13 documents covering 6 domains")
1226
+
1227
# Script entry point: generate the sample dataset when run directly.
if __name__ == "__main__":
    create_dataset()
evaluation.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation.py - Evaluation System
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ import numpy as np
5
+ from dataclasses import dataclass
6
+ import json
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
@dataclass
class Question:
    """Represents a single evaluation question for version-aware QA."""
    query: str            # natural-language question posed to the system
    query_type: str  # content_retrieval, version_inquiry, change_retrieval
    expected_answer: str  # reference answer used for content/keyword scoring
    expected_version: str  # version the answer should come from, or "all"
    domain: str           # e.g. Software, Healthcare, Finance, Industrial
    topic: str            # document topic within the domain
    # Optional extra keywords for scoring; the original annotation claimed a
    # plain List[str] while defaulting to None.
    expected_keywords: 'List[str] | None' = None
19
+
20
class VersionQADataset:
    """Dataset for evaluating version-aware QA.

    Wraps a list of Question objects and provides dict (de)serialization
    plus a factory for the built-in Mini-VersionQA benchmark.
    """

    def __init__(self, questions: List[Question]):
        self.questions = questions

    @classmethod
    def create_mini_versionqa(cls) -> 'VersionQADataset':
        """Create the Mini-VersionQA dataset as specified.

        13 hand-written questions spanning 6 domains and the three query
        types (content_retrieval, version_inquiry, change_retrieval).
        """
        questions = [
            # Software - Node.js Assert
            Question(
                query="What is the assert module in Node.js v20.0?",
                query_type="content_retrieval",
                expected_answer="assert module provides testing functions",
                expected_version="v20.0",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["assert", "testing", "module"]
            ),
            Question(
                query="List all versions of the assert module",
                query_type="version_inquiry",
                expected_answer="v20.0, v21.0, v23.0",
                expected_version="all",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["v20.0", "v21.0", "v23.0"]
            ),
            Question(
                query="When was the strict mode added to assert?",
                query_type="change_retrieval",
                expected_answer="v21.0",
                expected_version="v21.0",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["strict", "mode", "v21.0"]
            ),

            # Software - Bootstrap
            Question(
                query="What are the grid classes in Bootstrap v5.2?",
                query_type="content_retrieval",
                expected_answer="col-*, row classes for responsive grid",
                expected_version="v5.2",
                domain="Software",
                topic="Bootstrap",
                expected_keywords=["grid", "col", "row"]
            ),
            Question(
                query="What changed in Bootstrap from v5.2 to v5.3?",
                query_type="change_retrieval",
                expected_answer="new utility classes and improvements",
                expected_version="v5.3",
                domain="Software",
                topic="Bootstrap",
                expected_keywords=["utility", "classes", "v5.3"]
            ),

            # Software - Spark
            Question(
                query="How does DataFrame work in Spark v3.0?",
                query_type="content_retrieval",
                expected_answer="distributed collection of data organized into named columns",
                expected_version="v3.0",
                domain="Software",
                topic="Spark",
                expected_keywords=["dataframe", "distributed", "columns"]
            ),
            Question(
                query="What was removed in Spark v3.5?",
                query_type="change_retrieval",
                expected_answer="deprecated APIs and legacy features",
                expected_version="v3.5",
                domain="Software",
                topic="Spark",
                expected_keywords=["removed", "deprecated", "v3.5"]
            ),

            # Healthcare
            Question(
                query="What are the treatment guidelines in v1.0?",
                query_type="content_retrieval",
                expected_answer="standard treatment protocols for patient care",
                expected_version="v1.0",
                domain="Healthcare",
                topic="Clinical Guidelines",
                expected_keywords=["treatment", "protocols", "guidelines"]
            ),
            Question(
                query="What changed in clinical guidelines from v1.0 to v2.0?",
                query_type="change_retrieval",
                expected_answer="updated treatment protocols and new recommendations",
                expected_version="v2.0",
                domain="Healthcare",
                topic="Clinical Guidelines",
                expected_keywords=["updated", "protocols", "v2.0"]
            ),

            # Finance
            Question(
                query="What are the compliance requirements in FY2023?",
                query_type="content_retrieval",
                expected_answer="regulatory compliance requirements for financial reporting",
                expected_version="FY2023",
                domain="Finance",
                topic="Compliance Reports",
                expected_keywords=["compliance", "requirements", "regulatory"]
            ),
            Question(
                query="What regulations changed from FY2023 to FY2024?",
                query_type="change_retrieval",
                expected_answer="new regulatory requirements and updated compliance standards",
                expected_version="FY2024",
                domain="Finance",
                topic="Compliance Reports",
                expected_keywords=["regulations", "changed", "FY2024"]
            ),

            # Industrial
            Question(
                query="What is the startup procedure in Rev. 1.0?",
                query_type="content_retrieval",
                expected_answer="machine startup steps and initialization procedures",
                expected_version="Rev. 1.0",
                domain="Industrial",
                topic="Machine Operation",
                expected_keywords=["startup", "procedure", "machine"]
            ),
            Question(
                query="What safety features were added in Rev. 2.0?",
                query_type="change_retrieval",
                expected_answer="enhanced safety features and emergency protocols",
                expected_version="Rev. 2.0",
                domain="Industrial",
                topic="Machine Operation",
                expected_keywords=["safety", "features", "Rev. 2.0"]
            ),
        ]

        return cls(questions)

    @classmethod
    def from_dict(cls, data: List[Dict]) -> 'VersionQADataset':
        """Load dataset from dictionary.

        Each entry must carry the required Question fields; missing
        'expected_keywords' defaults to an empty list.
        """
        questions = []
        for q in data:
            questions.append(Question(
                query=q['query'],
                query_type=q['query_type'],
                expected_answer=q['expected_answer'],
                expected_version=q['expected_version'],
                domain=q['domain'],
                topic=q['topic'],
                expected_keywords=q.get('expected_keywords', [])
            ))
        return cls(questions)

    def to_dict(self) -> List[Dict]:
        """Convert dataset to a list of plain dicts (inverse of from_dict)."""
        return [
            {
                'query': q.query,
                'query_type': q.query_type,
                'expected_answer': q.expected_answer,
                'expected_version': q.expected_version,
                'domain': q.domain,
                'topic': q.topic,
                'expected_keywords': q.expected_keywords
            }
            for q in self.questions
        ]
192
+
193
class Evaluator:
    """Evaluates VersionRAG and Baseline systems.

    Runs every question in a VersionQADataset through both systems, scores
    each answer on content overlap, keyword coverage and version awareness,
    and aggregates per-system metrics plus the improvement deltas.
    """

    def __init__(self, version_rag, baseline_rag):
        # version_rag: must expose query(), version_inquiry(), change_retrieval()
        # baseline_rag: must expose query()
        # Both are expected to return dicts with 'answer' and 'sources' keys.
        self.version_rag = version_rag
        self.baseline_rag = baseline_rag

    def evaluate(self, dataset: 'VersionQADataset') -> Dict:
        """Run full evaluation on dataset.

        Returns a dict with 'versionrag' and 'baseline' metric blocks, the
        question count, and metric deltas under 'improvement'.
        """
        versionrag_results = []
        baseline_results = []

        for question in dataset.questions:
            # Evaluate VersionRAG, dispatching on the question type.
            start_time = time.time()

            try:
                if question.query_type == "content_retrieval":
                    vrag_answer = self.version_rag.query(
                        query=question.query,
                        version_filter=question.expected_version if question.expected_version != "all" else None
                    )
                elif question.query_type == "version_inquiry":
                    vrag_answer = self.version_rag.version_inquiry(question.query)
                else:  # change_retrieval
                    vrag_answer = self.version_rag.change_retrieval(question.query)

                vrag_latency = time.time() - start_time
            except Exception as e:
                # Keep the evaluation running on per-question failures; the
                # failed question is scored as an empty answer.
                print(f"VersionRAG error on '{question.query}': {e}")
                vrag_answer = {'answer': '', 'sources': []}
                vrag_latency = 0

            # Evaluate Baseline (single retrieval path for all query types).
            start_time = time.time()
            try:
                baseline_answer = self.baseline_rag.query(question.query)
                baseline_latency = time.time() - start_time
            except Exception as e:
                print(f"Baseline error on '{question.query}': {e}")
                baseline_answer = {'answer': '', 'sources': []}
                baseline_latency = 0

            # Score both answers against the reference.
            vrag_score = self._score_answer(
                vrag_answer.get('answer', ''),
                question.expected_answer,
                vrag_answer.get('sources', []),
                question.expected_version,
                question.expected_keywords
            )

            baseline_score = self._score_answer(
                baseline_answer.get('answer', ''),
                question.expected_answer,
                baseline_answer.get('sources', []),
                question.expected_version,
                question.expected_keywords
            )

            versionrag_results.append({
                'question': question,
                'score': vrag_score,
                'latency': vrag_latency,
                'answer': vrag_answer.get('answer', '')
            })

            baseline_results.append({
                'question': question,
                'score': baseline_score,
                'latency': baseline_latency,
                'answer': baseline_answer.get('answer', '')
            })

        # Aggregate per-system metrics.
        versionrag_metrics = self._compute_metrics(versionrag_results)
        baseline_metrics = self._compute_metrics(baseline_results)

        return {
            'versionrag': versionrag_metrics,
            'baseline': baseline_metrics,
            'questions': len(dataset.questions),
            'improvement': {
                'accuracy': versionrag_metrics['accuracy'] - baseline_metrics['accuracy'],
                'vsa': versionrag_metrics['vsa'] - baseline_metrics['vsa'],
                'hit_at_5': versionrag_metrics['hit_at_5'] - baseline_metrics['hit_at_5']
            }
        }

    def _score_answer(self, answer: str, expected: str, sources: List[Dict],
                      expected_version: str, expected_keywords: List[str] = None) -> Dict:
        """Score an answer based on correctness and version awareness.

        Returns content_score, version_score, keyword_score and their
        weighted combination (0.4 / 0.4 / 0.2) as total_score.
        """
        if not answer:
            # Empty answer scores zero on every axis.
            return {
                'content_score': 0.0,
                'version_score': 0.0,
                'keyword_score': 0.0,
                'total_score': 0.0
            }

        # Keyword-based content scoring: expected-answer words plus any
        # explicitly listed keywords form the target set.
        expected_keywords_set = set(expected.lower().split())
        if expected_keywords:
            expected_keywords_set.update([k.lower() for k in expected_keywords])

        answer_keywords = set(answer.lower().split())

        # Fraction of target keywords that appear in the answer, capped at 1.
        overlap = len(expected_keywords_set & answer_keywords)
        keyword_score = min(overlap / max(len(expected_keywords_set), 1), 1.0)

        # Semantic similarity (simple word overlap as proxy).
        answer_words = answer.lower().split()
        expected_words = expected.lower().split()

        common_words = set(answer_words) & set(expected_words)
        if len(expected_words) > 0:
            content_score = len(common_words) / len(expected_words)
        else:
            content_score = 0.0

        # Boost score if answer is substantive and contains key terms.
        if len(answer) > 20 and keyword_score > 0.3:
            content_score = min(content_score * 1.2, 1.0)

        # Check version awareness via the retrieved sources.
        version_score = self._compute_version_score(sources, expected_version)

        # Combined, weighted score.
        total_score = (content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2)

        return {
            'content_score': content_score,
            'version_score': version_score,
            'keyword_score': keyword_score,
            'total_score': total_score
        }

    def _compute_version_score(self, sources: List[Dict], expected_version: str) -> float:
        """Compute version-awareness score from the retrieved sources."""
        if expected_version == "all":
            # For version inquiry, reward answers whose sources span many
            # distinct versions (saturating at 3 versions).
            versions_in_sources = set()
            for source in sources:
                if isinstance(source, dict):
                    version = source.get('version', 'N/A')
                    if version != 'N/A':
                        versions_in_sources.add(version)

            return min(len(versions_in_sources) / 3.0, 1.0)
        else:
            # For a specific version, it is all-or-nothing: 1.0 if any
            # source carries the expected version string.
            for source in sources:
                if isinstance(source, dict):
                    version = source.get('version', '')
                    if expected_version in str(version):
                        return 1.0
            return 0.0

    def _compute_metrics(self, results: List[Dict]) -> Dict:
        """Compute aggregate evaluation metrics over per-question results."""
        if not results:
            return {
                'accuracy': 0.0,
                'hit_at_5': 0.0,
                'mrr': 0.0,
                'vsa': 0.0,
                'avg_latency': 0.0,
                'by_type': {
                    'content_retrieval': 0.0,
                    'version_inquiry': 0.0,
                    'change_retrieval': 0.0
                }
            }

        # Overall metrics
        total_scores = [r['score']['total_score'] for r in results]
        content_scores = [r['score']['content_score'] for r in results]
        version_scores = [r['score']['version_score'] for r in results]
        latencies = [r['latency'] for r in results]

        # Hit@k (consider a hit if total score > 0.5).
        hits = [1 if score > 0.5 else 0 for score in total_scores]

        # MRR (Mean Reciprocal Rank), approximated from the score:
        # rank 1 if score > 0.7, rank 2 if > 0.5, rank 3 if > 0.3, else rank 5.
        reciprocal_ranks = []
        for score in total_scores:
            if score > 0.7:
                reciprocal_ranks.append(1.0)
            elif score > 0.5:
                reciprocal_ranks.append(1/2)
            elif score > 0.3:
                reciprocal_ranks.append(1/3)
            else:
                reciprocal_ranks.append(1/5)

        # Per-query-type score buckets.
        by_type = {
            'content_retrieval': [],
            'version_inquiry': [],
            'change_retrieval': []
        }

        for result in results:
            qtype = result['question'].query_type
            # Robustness fix: the original indexed by_type[qtype] directly and
            # raised KeyError for any unexpected query_type; setdefault keeps
            # the known buckets and tolerates (and still records) unknown ones.
            by_type.setdefault(qtype, []).append(result['score']['total_score'])

        return {
            'accuracy': np.mean(total_scores) * 100,
            'hit_at_5': np.mean(hits) * 100,
            'mrr': np.mean(reciprocal_ranks),
            'vsa': np.mean(version_scores) * 100,  # Version-Sensitive Accuracy
            'avg_latency': np.mean(latencies) if latencies else 0,
            'by_type': {
                'content_retrieval': np.mean(by_type['content_retrieval']) * 100 if by_type['content_retrieval'] else 0,
                'version_inquiry': np.mean(by_type['version_inquiry']) * 100 if by_type['version_inquiry'] else 0,
                'change_retrieval': np.mean(by_type['change_retrieval']) * 100 if by_type['change_retrieval'] else 0
            }
        }
graph_manager.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # graph_manager.py - Version Graph Management
2
+ import networkx as nx
3
+ from typing import List, Dict, Optional, Set
4
+ import json
5
+ from datetime import datetime
6
+ import difflib
7
+
8
class GraphManager:
    """Manages a version graph with documents, versions, and changes.

    Nodes are either 'document', 'version' (keyed "<doc>:<version>") or
    'changes' nodes; edges are 'has_version', 'next_version', 'has_changes'.
    Raw text per (document, version) is kept in version_content.
    """

    def __init__(self, user_id: str):
        self.user_id = user_id
        self.graph = nx.DiGraph()
        self.document_versions = {}  # document_name -> [versions]
        self.version_content = {}    # (document, version) -> content

    def add_document_version(self, document_name: str, version: str,
                             content: str, metadata: Dict = None):
        """Add a new version of a document to the graph.

        Creates the document node on first use, adds the version node,
        stores the raw content, and links the version to its predecessor.
        """
        # Create document node if it doesn't exist
        if document_name not in self.graph:
            self.graph.add_node(document_name, node_type='document',
                                metadata=metadata or {})
            self.document_versions[document_name] = []

        # Create version node
        version_node = f"{document_name}:{version}"
        self.graph.add_node(
            version_node,
            node_type='version',
            version=version,
            document=document_name,
            timestamp=datetime.now().isoformat(),
            metadata=metadata or {}
        )

        # Link document to version
        self.graph.add_edge(document_name, version_node, edge_type='has_version')

        # Store content
        self.version_content[(document_name, version)] = content

        # Add to version list.
        # NOTE(review): plain .sort() orders versions lexicographically, so
        # e.g. "v10.0" sorts before "v2.0" — TODO confirm whether a natural
        # version ordering is intended here.
        if version not in self.document_versions[document_name]:
            self.document_versions[document_name].append(version)
            self.document_versions[document_name].sort()

        # Link to previous version if one exists.
        # Bug fix: the original used versions[versions.index(version) - 1],
        # which for index 0 wraps to versions[-1] and wrongly linked the
        # LAST version as the predecessor of the first-sorting one.
        versions = self.document_versions[document_name]
        idx = versions.index(version)
        if idx > 0:
            prev_node = f"{document_name}:{versions[idx - 1]}"
            self.graph.add_edge(prev_node, version_node, edge_type='next_version')

    def add_version_with_changes(self, document_name: str, version: str,
                                 changes: Dict):
        """Attach an explicit change record to an existing version node."""
        version_node = f"{document_name}:{version}"

        # Create change node
        change_node = f"{version_node}:changes"
        self.graph.add_node(
            change_node,
            node_type='changes',
            additions=changes.get('additions', []),
            deletions=changes.get('deletions', []),
            modifications=changes.get('modifications', []),
            timestamp=datetime.now().isoformat()
        )

        # Link version to changes
        self.graph.add_edge(version_node, change_node, edge_type='has_changes')

    def get_all_documents(self) -> List[str]:
        """Get list of all document nodes in the graph."""
        return [node for node, data in self.graph.nodes(data=True)
                if data.get('node_type') == 'document']

    def get_document_versions(self, document_name: str) -> List[str]:
        """Get all known versions of a document (empty list if unknown)."""
        return self.document_versions.get(document_name, [])

    def get_version_info(self, document_name: str, version: str) -> Dict:
        """Get the node attributes of a specific version ({} if absent)."""
        version_node = f"{document_name}:{version}"
        if version_node in self.graph:
            return self.graph.nodes[version_node]
        return {}

    def get_changes_between_versions(self, document_name: str,
                                     version1: str, version2: str) -> Dict:
        """Compute line-level changes between two stored versions.

        Returns dict with 'additions', 'deletions', 'modifications' lists
        (each truncated to 10 entries for display). Empty lists when either
        version's content is missing.
        """
        content1 = self.version_content.get((document_name, version1), "")
        content2 = self.version_content.get((document_name, version2), "")

        if not content1 or not content2:
            return {'additions': [], 'deletions': [], 'modifications': []}

        # Compute diff
        lines1 = content1.split('\n')
        lines2 = content2.split('\n')

        diff = difflib.unified_diff(lines1, lines2, lineterm='')

        additions = []
        deletions = []
        modifications = []

        for line in diff:
            if line.startswith('+') and not line.startswith('+++'):
                additions.append(line[1:])
            elif line.startswith('-') and not line.startswith('---'):
                deletions.append(line[1:])
            elif line.startswith('?'):
                # NOTE(review): unified_diff never emits '?' lines (only
                # difflib.Differ does), so this branch appears dead — kept
                # for compatibility; confirm before removing.
                modifications.append(line[1:])

        return {
            'additions': additions[:10],  # Limit for display
            'deletions': deletions[:10],
            'modifications': modifications[:10]
        }

    def query_version_graph(self, query: str) -> List[Dict]:
        """Return version nodes whose data matches any query term.

        Simple keyword matching over the stringified node attributes; can be
        enhanced with embeddings later.
        """
        results = []

        for node, data in self.graph.nodes(data=True):
            if data.get('node_type') == 'version':
                if any(term.lower() in str(data).lower() for term in query.split()):
                    results.append({
                        'node': node,
                        'data': data
                    })

        return results

    def export_graph(self) -> Dict:
        """Export the graph structure as plain dicts/lists."""
        return {
            'nodes': dict(self.graph.nodes(data=True)),
            'edges': list(self.graph.edges(data=True)),
            'document_versions': self.document_versions
        }

    def import_graph(self, graph_data: Dict):
        """Rebuild the graph from a structure produced by export_graph."""
        self.graph = nx.DiGraph()

        for node, data in graph_data['nodes'].items():
            self.graph.add_node(node, **data)

        for source, target, data in graph_data['edges']:
            self.graph.add_edge(source, target, **data)

        self.document_versions = graph_data.get('document_versions', {})
+ self.document_versions = graph_data.get('document_versions', {})
utils.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py - Utility Functions
2
+ import PyPDF2
3
+ import io
4
+ import difflib
5
+ from typing import List, Dict
6
+ import hashlib
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+
11
class DocumentProcessor:
    """Document processing utilities: PDF text extraction, hashing, chunking."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract plain text from a PDF given as raw bytes.

        Returns the concatenated text of all pages, one newline appended
        per page. Raises Exception with a descriptive message (chained to
        the original error) if parsing fails.
        """
        try:
            pdf_file = io.BytesIO(pdf_bytes)
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for pages with no
                # extractable text; guard so we never concatenate None.
                text += (page.extract_text() or "") + "\n"

            return text
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the SHA-256 hex digest of *content*."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split *text* into chunks of *chunk_size* chars overlapping by *overlap*.

        Whitespace-only chunks are skipped. Raises ValueError for a
        non-positive chunk_size or when overlap >= chunk_size, which would
        otherwise make the loop never advance (infinite loop).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]
            if chunk.strip():  # Only add non-empty chunks
                chunks.append(chunk)
            start = end - overlap

        return chunks
49
+
50
class ChangeDetector:
    """Detect changes between document versions."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-level diff between two text versions.

        Returns a dict with 'additions', 'deletions' and 'modifications'
        lists, bucketed by difflib.Differ's two-character line markers
        ('+ ', '- ', '? ').
        """
        delta = difflib.Differ().compare(old_text.split('\n'),
                                         new_text.split('\n'))

        buckets = {'+ ': [], '- ': [], '? ': []}
        for entry in delta:
            tag = entry[:2]
            if tag in buckets:
                buckets[tag].append(entry[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                  embeddings) -> List[Dict]:
        """Detect semantic changes between versions using embeddings.

        Currently a placeholder: both versions are chunked and embedded,
        but an empty change list is always returned. Kept for API
        compatibility; embedding errors are printed and swallowed.
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)

        try:
            old_embeddings = embeddings.embed_documents(old_chunks)
            new_embeddings = embeddings.embed_documents(new_chunks)

            # Simplified version - no change records are produced yet;
            # a real implementation would compare the two embedding sets.
            changes = []

            return changes
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []
99
+
100
class PersistentStorage:
    """Persist per-user upload metadata as JSON on disk.

    Metadata lives in ./user_data_<user_id>/uploaded_files.json. Save and
    load errors are printed and treated as best-effort / "no metadata",
    matching the original tolerant behavior.
    """

    def __init__(self, user_id: str):
        # Side effect: ensures the per-user storage directory exists.
        self.user_id = user_id
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Write *metadata* to the JSON file (errors are printed, not raised)."""
        try:
            # Explicit UTF-8 so behavior does not depend on the platform's
            # default locale encoding.
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2)
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Read metadata back; returns {} if the file is missing or unreadable."""
        if not self.metadata_file.exists():
            return {}
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Delete the metadata file if it exists (no-op otherwise)."""
        if self.metadata_file.exists():
            self.metadata_file.unlink()
version_rag.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # version_rag.py - Core VersionRAG Implementation (OpenAI Embeddings)
2
+ import chromadb
3
+ from chromadb.config import Settings
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
6
+ from typing import List, Dict, Optional
7
+ import os
8
+ from datetime import datetime
9
+ import uuid
10
+
11
class VersionRAG:
    """Version-aware RAG system combining a graph index with a vector store.

    Content chunks and change records (additions/deletions/modifications)
    are stored in a single per-user ChromaDB collection, tagged with
    ``tenant_id`` and ``doc_type`` metadata so queries can filter by user,
    document version, and record kind. An optional graph manager supplies
    the document/version structure for version inquiries.
    """

    def __init__(self, user_id: str, model_name: str = "gpt-3.5-turbo",
                 embedding_model: str = "text-embedding-3-small"):
        """Set up OpenAI embeddings, a persistent Chroma collection, and the LLM.

        Side effects: creates ./chroma_db_<user_id> on disk and opens (or
        creates) the collection "versionrag_<user_id>".
        """
        self.user_id = user_id
        self.model_name = model_name

        # Initialize embeddings - Using OpenAI instead of sentence-transformers
        self.embeddings = OpenAIEmbeddings(model=embedding_model)

        # Initialize ChromaDB with persistence (one directory per user/tenant)
        persist_dir = f"./chroma_db_{user_id}"
        os.makedirs(persist_dir, exist_ok=True)

        self.chroma_client = chromadb.PersistentClient(path=persist_dir)

        # Create collection with tenant metadata
        collection_name = f"versionrag_{user_id}"
        try:
            self.collection = self.chroma_client.get_collection(name=collection_name)
        except:  # NOTE(review): bare except — any failure falls through to creation; consider narrowing
            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                metadata={"tenant_id": user_id}
            )

        # Initialize LLM (temperature 0 for deterministic answers)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0
        )

        # Text splitter applied to every document added to the store
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )

        # In-memory mirrors of chunks/metadata added during this session
        self.documents = []
        self.metadatas = []
        self.graph_manager = None  # attached later via set_graph_manager()

    def set_graph_manager(self, graph_manager):
        """Attach the graph manager used for version tracking lookups."""
        self.graph_manager = graph_manager

    def add_documents(self, texts: List[str], metadatas: List[Dict], changes: Optional[List[Dict]] = None):
        """Add documents to the vector store with version metadata and changes.

        Each text is chunked and stored with doc_type='content'. When a
        parallel ``changes`` list is supplied (dicts with 'additions' /
        'deletions' / 'modifications'), each change line is stored as its
        own doc_type='change' record, capped at 20 per category and
        skipping trivial lines (<= 10 chars after stripping).
        """
        all_chunks = []
        all_chunk_metadatas = []
        all_ids = []

        for idx, (text, metadata) in enumerate(zip(texts, metadatas)):
            # Split text into chunks
            chunks = self.text_splitter.split_text(text)

            # Add tenant_id to metadata
            for chunk_idx, chunk in enumerate(chunks):
                chunk_metadata = metadata.copy()
                chunk_metadata['tenant_id'] = self.user_id
                # chunk_id is the global position across all docs in this call
                chunk_metadata['chunk_id'] = len(all_chunks)
                chunk_metadata['doc_type'] = 'content'

                all_chunks.append(chunk)
                all_chunk_metadatas.append(chunk_metadata)
                all_ids.append(f"{self.user_id}_content_{uuid.uuid4()}")

            # Add change information if provided (changes list parallels texts)
            if changes and idx < len(changes) and changes[idx]:
                change_info = changes[idx]

                # Add additions as separate chunks
                for addition in change_info.get('additions', [])[:20]:
                    if len(addition.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'addition'

                        all_chunks.append(f"[ADDITION in {metadata.get('version')}] {addition}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

                # Add deletions as separate chunks
                for deletion in change_info.get('deletions', [])[:20]:
                    if len(deletion.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'deletion'

                        all_chunks.append(f"[DELETION in {metadata.get('version')}] {deletion}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

                # Add modifications as separate chunks
                for modification in change_info.get('modifications', [])[:20]:
                    if len(modification.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'modification'

                        all_chunks.append(f"[MODIFICATION in {metadata.get('version')}] {modification}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

        # Add to ChromaDB (single batched call; embeds every chunk at once)
        if all_chunks:
            embeddings = self.embeddings.embed_documents(all_chunks)

            self.collection.add(
                embeddings=embeddings,
                documents=all_chunks,
                metadatas=all_chunk_metadatas,
                ids=all_ids
            )

            self.documents.extend(all_chunks)
            self.metadatas.extend(all_chunk_metadatas)

    def query(self, query: str, version_filter: Optional[str] = None,
              top_k: int = 5) -> Dict:
        """Answer a question over content chunks, optionally pinned to a version.

        Retrieves top_k content chunks for this tenant (filtered to
        ``version_filter`` when given), builds a version-labelled context,
        and asks the LLM to answer citing versions. Returns a dict with
        'answer', 'sources' (each with similarity = 1 - distance), and
        'context'; errors are reported inside 'answer' rather than raised.
        """
        # Embed query
        query_embedding = self.embeddings.embed_query(query)

        # Build where clause for filtering (Chroma $and of equality filters)
        if version_filter:
            where = {
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"},
                    {"version": version_filter}
                ]
            }
        else:
            where = {
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"}
                ]
            }

        # Query ChromaDB
        try:
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k,
                where=where
            )
        except Exception as e:
            return {
                'answer': f"Error querying database: {str(e)}",
                'sources': []
            }

        # Extract results (Chroma nests per-query lists; [0] is our only query)
        if not results['documents'][0]:
            return {
                'answer': "No relevant documents found.",
                'sources': []
            }

        # Prepare context
        context_docs = results['documents'][0]
        context_metadatas = results['metadatas'][0]
        distances = results['distances'][0]

        # Build context string, labelling each chunk with version and topic
        context = "\n\n".join([
            f"[Version {meta.get('version', 'N/A')} - {meta.get('topic', 'Unknown')}]\n{doc}"
            for doc, meta in zip(context_docs, context_metadatas)
        ])

        # Generate answer using LLM
        prompt = f"""Based on the following context, answer the question.
If the answer includes version-specific information, explicitly mention the version.
Be precise and cite the version when relevant.

Context:
{context}

Question: {query}

Answer:"""

        try:
            response = self.llm.invoke(prompt)
            # ChatOpenAI returns a message object; fall back to str() otherwise
            answer = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            answer = f"Error generating answer: {str(e)}"

        # Prepare sources (similarity assumes distance in [0, 1] — TODO confirm metric)
        sources = []
        for doc, meta, dist in zip(context_docs, context_metadatas, distances):
            sources.append({
                'content': doc,
                'version': meta.get('version', 'N/A'),
                'filename': meta.get('filename', 'N/A'),
                'domain': meta.get('domain', 'N/A'),
                'topic': meta.get('topic', 'N/A'),
                'similarity': 1 - dist
            })

        return {
            'answer': answer,
            'sources': sources,
            'context': context
        }

    def version_inquiry(self, query: str) -> Dict:
        """Answer "what versions exist" style questions.

        Prefers the graph manager: any document whose name shares a word
        with the query is listed with its versions and timestamps. Falls
        back to scanning metadata of the top-20 vector hits when the graph
        yields nothing (or no graph manager is attached).
        """
        if self.graph_manager:
            documents = self.graph_manager.get_all_documents()

            # Keyword overlap between query words and document names
            relevant_docs = []
            query_lower = query.lower()
            for doc in documents:
                if any(word in doc.lower() for word in query_lower.split()):
                    relevant_docs.append(doc)

            if relevant_docs:
                answer = f"Found version information for {len(relevant_docs)} document(s):\n\n"
                versions_found = []

                for doc in relevant_docs:
                    versions = self.graph_manager.get_document_versions(doc)
                    versions_found.extend(versions)
                    answer += f"**{doc}**\n"
                    answer += f"- Versions: {', '.join(versions)}\n"

                    for version in versions:
                        info = self.graph_manager.get_version_info(doc, version)
                        if info:
                            answer += f"  - {version}: {info.get('timestamp', 'N/A')}\n"
                    answer += "\n"

                return {
                    'answer': answer,
                    'sources': [],
                    'versions': list(set(versions_found))
                }

        # Fallback to vector search: collect versions seen in top-hit metadata
        query_embedding = self.embeddings.embed_query(query)

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=20,
            where={
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"}
                ]
            }
        )

        versions = set()
        version_info = {}

        for meta in results['metadatas'][0]:
            version = meta.get('version', 'N/A')
            if version != 'N/A':
                versions.add(version)
                # Keep the first metadata record seen for each version
                if version not in version_info:
                    version_info[version] = {
                        'filename': meta.get('filename', 'N/A'),
                        'domain': meta.get('domain', 'N/A'),
                        'topic': meta.get('topic', 'N/A')
                    }

        version_list = ", ".join(sorted(versions))
        answer = f"Found {len(versions)} version(s): {version_list}\n\n"

        for version in sorted(versions):
            info = version_info[version]
            answer += f"- **{version}**: {info['topic']} ({info['domain']})\n"

        return {
            'answer': answer,
            'sources': [],
            'versions': list(versions)
        }

    def change_retrieval(self, query: str) -> Dict:
        """Retrieve change information between versions.

        First searches doc_type='change' records; if none match, falls
        back to querying all tenant records and asking the LLM to infer
        changes from general context.
        """
        query_embedding = self.embeddings.embed_query(query)

        try:
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=10,
                where={
                    "$and": [
                        {"tenant_id": self.user_id},
                        {"doc_type": "change"}
                    ]
                }
            )
        except:  # NOTE(review): bare except — retries without the doc_type filter on any failure
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=10,
                where={"tenant_id": self.user_id}
            )

        if results['documents'][0] and results['metadatas'][0]:
            # Keep only genuine change records (fallback query may return content)
            changes = []
            for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
                if meta.get('doc_type') == 'change':
                    changes.append({
                        'content': doc,
                        'version': meta.get('version', 'N/A'),
                        'change_type': meta.get('change_type', 'unknown'),
                        'filename': meta.get('filename', 'N/A'),
                        'topic': meta.get('topic', 'N/A')
                    })

            if changes:
                answer = "Changes detected:\n\n"
                for change in changes[:5]:  # cap the displayed changes
                    answer += f"**[{change['version']} - {change['change_type'].upper()}]**\n"
                    answer += f"Topic: {change['topic']}\n"
                    answer += f"{change['content']}\n\n"

                return {
                    'answer': answer,
                    'sources': changes
                }

        # No explicit change records matched: ask the LLM over general context
        context_results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=5,
            where={"tenant_id": self.user_id}
        )

        if context_results['documents'][0]:
            context = "\n\n".join(context_results['documents'][0])
            prompt = f"""Based on the context, identify and describe any changes, additions, deletions, or modifications mentioned.

Context:
{context}

Question: {query}

Answer:"""

            try:
                response = self.llm.invoke(prompt)
                answer = response.content if hasattr(response, 'content') else str(response)
            except:  # NOTE(review): bare except hides the LLM error; consider logging it
                answer = "Unable to determine changes."
        else:
            answer = "No change information found."

        return {
            'answer': answer,
            'sources': context_results['metadatas'][0][:5] if context_results['metadatas'][0] else []
        }
373
+
374
+
375
class BaselineRAG:
    """Standard RAG system without version awareness.

    Plain chunk-embed-retrieve-answer pipeline over a per-user Chroma
    collection; used as the comparison baseline for VersionRAG.
    """

    def __init__(self, user_id: str, model_name: str = "gpt-3.5-turbo",
                 embedding_model: str = "text-embedding-3-small"):
        """Open (or create) the per-user baseline collection and the LLM."""
        self.user_id = user_id
        self.model_name = model_name

        # Initialize embeddings - Using OpenAI
        self.embeddings = OpenAIEmbeddings(model=embedding_model)

        persist_dir = f"./chroma_baseline_{user_id}"
        os.makedirs(persist_dir, exist_ok=True)

        self.chroma_client = chromadb.PersistentClient(path=persist_dir)

        collection_name = f"baseline_{user_id}"
        try:
            self.collection = self.chroma_client.get_collection(name=collection_name)
        except:
            self.collection = self.chroma_client.create_collection(name=collection_name)

        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0
        )

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def add_documents(self, texts: List[str], metadatas: List[Dict]):
        """Chunk each document and store every chunk with a copy of its metadata."""
        chunk_texts = []
        chunk_metas = []
        chunk_ids = []

        for doc_text, doc_meta in zip(texts, metadatas):
            for piece in self.text_splitter.split_text(doc_text):
                chunk_texts.append(piece)
                chunk_metas.append(doc_meta.copy())
                chunk_ids.append(f"baseline_{self.user_id}_{uuid.uuid4()}")

        if not chunk_texts:
            return

        vectors = self.embeddings.embed_documents(chunk_texts)

        self.collection.add(
            embeddings=vectors,
            documents=chunk_texts,
            metadatas=chunk_metas,
            ids=chunk_ids
        )

    def query(self, query: str, top_k: int = 5) -> Dict:
        """Answer a question from the top_k retrieved chunks (no version logic).

        Returns {'answer', 'sources'}; retrieval or LLM errors are folded
        into the 'answer' string instead of being raised.
        """
        query_embedding = self.embeddings.embed_query(query)

        try:
            hits = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k
            )
        except Exception as e:
            return {
                'answer': f"Error: {str(e)}",
                'sources': []
            }

        docs = hits['documents'][0]
        if not docs:
            return {
                'answer': "No relevant documents found.",
                'sources': []
            }

        context = "\n\n".join(docs)

        prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""

        try:
            response = self.llm.invoke(prompt)
            answer = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            answer = f"Error: {str(e)}"

        sources = [
            {'content': doc, 'metadata': meta}
            for doc, meta in zip(docs, hits['metadatas'][0])
        ]

        return {
            'answer': answer,
            'sources': sources
        }