shahbazdev0 commited on
Commit
f7db2f9
·
verified ·
1 Parent(s): de86471

Upload 9 files

Browse files
Files changed (8) hide show
  1. .gitignore +12 -0
  2. README.md +2 -2
  3. app.py +915 -0
  4. create_sample_dataset.py +1228 -0
  5. evaluation.py +413 -0
  6. graph_manager.py +156 -0
  7. utils.py +131 -0
  8. version_rag.py +477 -0
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local environment, caches, and runtime artifacts
2
+ .env
3
+ *.pyc
4
+ __pycache__/
5
+ chroma_db_*/
6
+ chroma_baseline_*/
7
+ user_data_*/
8
+ sample_data/
9
+ venv/
10
+ .venv/
11
+ *.log
12
+ .DS_Store
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 📚
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: streamlit
7
- sdk_version: 1.51.0
8
- app_file: src/streamlit_app.py
9
  pinned: false
10
  ---
11
 
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.28.0
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
app.py ADDED
@@ -0,0 +1,915 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # app.py - Main Streamlit Application
3
+ import streamlit as st
4
+ import os
5
+ import json
6
+ import hashlib
7
+ import time
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import List, Dict, Optional, Tuple
14
+ import uuid
15
+
16
+ # Import custom modules
17
+ from version_rag import VersionRAG, BaselineRAG
18
+ from graph_manager import GraphManager
19
+ from evaluation import Evaluator, VersionQADataset
20
+ from utils import DocumentProcessor, ChangeDetector, PersistentStorage
21
+
22
# Page configuration — must be the first Streamlit call in the script.
_PAGE_CONFIG = {
    "page_title": "VersionRAG - Version-Aware RAG System",
    "page_icon": "📚",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
29
+
30
# Initialize session state
def init_session_state():
    """Seed st.session_state with every key the app reads.

    Safe to call on each Streamlit rerun: only missing keys are set,
    so existing session data survives reruns.
    """
    if 'user_id' not in st.session_state:
        # Fresh anonymous identity per browser session; used to namespace
        # the per-user VersionRAG / BaselineRAG / GraphManager stores.
        st.session_state.user_id = str(uuid.uuid4())

    # NOTE: the dict literal is rebuilt on every call, so the mutable
    # defaults ({} / []) are never shared between sessions.
    defaults = {
        'version_rag': None,
        'baseline_rag': None,
        'graph_manager': None,
        'uploaded_files': {},
        'chat_history': [],
        'evaluation_results': None,
        'feedback_data': [],
        'persistent_storage': None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

init_session_state()
52
+
53
# Custom CSS. Classes used elsewhere in this file: .main-header (title banner),
# .diff-added / .diff-removed (Tab 4 change rendering), .version-tag.
# NOTE(review): .metric-card does not appear to be referenced in this file — confirm before removing.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        padding: 1rem 0;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .diff-added {
        background-color: #d4edda;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
    }
    .diff-removed {
        background-color: #f8d7da;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
    }
    .version-tag {
        background-color: #e7f3ff;
        color: #0366d6;
        padding: 0.2rem 0.5rem;
        border-radius: 0.3rem;
        font-weight: bold;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 2rem;
    }
</style>
""", unsafe_allow_html=True)
91
+
92
# Sidebar: session info, model settings, retrieval knobs, system init,
# and a summary of the current knowledge base.
with st.sidebar:
    st.markdown("### 🔐 User Session")
    st.info(f"User ID: {st.session_state.user_id[:8]}...")

    st.markdown("### ⚙️ Settings")

    # API Key input — exported to the environment so downstream clients pick it up.
    api_key = st.text_input("OpenAI API Key", type="password",
                            value=os.getenv("OPENAI_API_KEY", ""))
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    # Model selection
    model_name = st.selectbox(
        "LLM Model",
        ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"],
        index=0
    )

    # Embedding model
    embedding_model = st.selectbox(
        "Embedding Model",
        ["text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002"],
        index=0
    )

    # Retrieval parameters
    st.markdown("### 🎯 Retrieval Parameters")
    top_k = st.slider("Top K Results", 1, 10, 5)
    # NOTE(review): similarity_threshold is collected here but never passed to
    # any query in this file — confirm it is intentional or wire it through.
    similarity_threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.7)

    # Initialize systems button — builds both RAG variants plus the version graph.
    if st.button("🚀 Initialize Systems", type="primary"):
        with st.spinner("Initializing VersionRAG and Baseline systems..."):
            try:
                st.session_state.version_rag = VersionRAG(
                    user_id=st.session_state.user_id,
                    model_name=model_name,
                    embedding_model=embedding_model
                )
                st.session_state.baseline_rag = BaselineRAG(
                    user_id=st.session_state.user_id,
                    model_name=model_name,
                    embedding_model=embedding_model
                )
                st.session_state.graph_manager = GraphManager(
                    user_id=st.session_state.user_id
                )
                st.success("✅ Systems initialized successfully!")
            except Exception as e:
                st.error(f"❌ Initialization error: {str(e)}")

    # Knowledge base status
    if st.session_state.uploaded_files:
        st.markdown("### 📚 Knowledge Base")
        for filename, info in st.session_state.uploaded_files.items():
            # FIX: label previously was a placeholder-less f-string and never
            # showed the file name; use the loop variable.
            with st.expander(f"📄 {filename}"):
                st.write(f"**Version:** {info['version']}")
                st.write(f"**Uploaded:** {info['timestamp']}")
                st.write(f"**Hash:** {info['hash'][:12]}...")
153
+
154
# Main content — page banner styled by the .main-header CSS class above.
st.markdown('<div class="main-header">📚 VersionRAG: Version-Aware RAG System</div>',
            unsafe_allow_html=True)

# Create tabs — order here defines the order of the tab variables.
_TAB_LABELS = [
    "📤 Document Upload",
    "💬 Query Interface",
    "📊 Evaluation",
    "🔍 Version Explorer",
    "📈 Analytics",
    "👥 Multi-User Management",
]
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(_TAB_LABELS)
167
+
168
# Tab 1: Document Upload
# Per-file flow: read -> extract text -> hash -> (skip if unchanged /
# diff if changed) -> index into VersionRAG + BaselineRAG -> record in
# the version graph -> remember metadata in session state.
with tab1:
    st.header("Document Upload & Indexing")

    col1, col2 = st.columns([2, 1])

    with col1:
        uploaded_files = st.file_uploader(
            "Upload versioned documents (PDF, TXT)",
            type=["pdf", "txt"],
            accept_multiple_files=True
        )

        if uploaded_files:
            st.markdown("### 📋 File Metadata")
            for idx, file in enumerate(uploaded_files):
                with st.expander(f"📄 {file.name}", expanded=True):
                    col_a, col_b = st.columns(2)
                    with col_a:
                        version = st.text_input(
                            "Version",
                            key=f"version_{idx}",
                            value="1.0.0"
                        )
                    with col_b:
                        domain = st.selectbox(
                            "Domain",
                            ["Software", "Healthcare", "Finance", "Industrial", "Other"],
                            key=f"domain_{idx}"
                        )

                    # Default topic: file name without its extension.
                    topic = st.text_input(
                        "Topic/Module",
                        key=f"topic_{idx}",
                        value=file.name.split('.')[0]
                    )

                    if st.button(f"Process {file.name}", key=f"process_{idx}"):
                        if not st.session_state.version_rag:
                            st.error("Please initialize systems first!")
                        else:
                            with st.spinner(f"Processing {file.name}..."):
                                try:
                                    # Read file content
                                    content = file.read()
                                    if file.type == "application/pdf":
                                        text = DocumentProcessor.extract_text_from_pdf(content)
                                    else:
                                        # Non-PDF uploads are treated as UTF-8 text.
                                        text = content.decode('utf-8')

                                    # Calculate hash of raw bytes — the dedup key.
                                    file_hash = hashlib.sha256(content).hexdigest()

                                    # Check if file already exists in this session.
                                    if file.name in st.session_state.uploaded_files:
                                        old_hash = st.session_state.uploaded_files[file.name]['hash']
                                        if old_hash == file_hash:
                                            st.info("File unchanged, skipping indexing.")
                                            # Skip straight to the next uploaded file.
                                            continue
                                        else:
                                            st.info("File changed, re-indexing with diff analysis...")
                                            # Perform diff analysis against the stored text.
                                            old_text = st.session_state.uploaded_files[file.name]['text']
                                            changes = ChangeDetector.compute_diff(old_text, text)

                                            # Record the change set on the version graph.
                                            st.session_state.graph_manager.add_version_with_changes(
                                                document_name=topic,
                                                version=version,
                                                changes=changes
                                            )

                                    # Add to VersionRAG (rich metadata per chunk).
                                    st.session_state.version_rag.add_documents(
                                        texts=[text],
                                        metadatas=[{
                                            'filename': file.name,
                                            'version': version,
                                            'domain': domain,
                                            'topic': topic,
                                            'hash': file_hash,
                                            'timestamp': datetime.now().isoformat()
                                        }]
                                    )

                                    # Add to Baseline RAG (minimal metadata, for comparison).
                                    st.session_state.baseline_rag.add_documents(
                                        texts=[text],
                                        metadatas=[{
                                            'filename': file.name,
                                            'version': version
                                        }]
                                    )

                                    # Register this version node in the graph.
                                    st.session_state.graph_manager.add_document_version(
                                        document_name=topic,
                                        version=version,
                                        content=text,
                                        metadata={
                                            'domain': domain,
                                            'filename': file.name
                                        }
                                    )

                                    # Store in session state; 'text' is kept so a later
                                    # re-upload can be diffed against it.
                                    st.session_state.uploaded_files[file.name] = {
                                        'version': version,
                                        'domain': domain,
                                        'topic': topic,
                                        'hash': file_hash,
                                        'text': text,
                                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                                    }

                                    st.success(f"✅ Successfully processed {file.name}")

                                except Exception as e:
                                    st.error(f"❌ Error processing {file.name}: {str(e)}")

    with col2:
        st.markdown("### 📊 Upload Statistics")
        if st.session_state.uploaded_files:
            stats_data = {
                'Total Files': len(st.session_state.uploaded_files),
                'Domains': len(set(f['domain'] for f in st.session_state.uploaded_files.values())),
                'Total Versions': len(set(f['version'] for f in st.session_state.uploaded_files.values()))
            }

            for key, value in stats_data.items():
                st.metric(key, value)

            # Domain distribution pie chart.
            domain_counts = {}
            for file_info in st.session_state.uploaded_files.values():
                domain = file_info['domain']
                domain_counts[domain] = domain_counts.get(domain, 0) + 1

            fig = px.pie(
                values=list(domain_counts.values()),
                names=list(domain_counts.keys()),
                title="Documents by Domain"
            )
            st.plotly_chart(fig, use_container_width=True)
312
+
313
# Tab 2: Query Interface
# Routes the query to one of three VersionRAG entry points by query type,
# optionally runs the baseline side-by-side, and records timing/history.
with tab2:
    st.header("Interactive Query Interface")

    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first from the sidebar!")
    else:
        # Query type selection
        query_type = st.radio(
            "Query Type",
            ["Content Retrieval", "Version Inquiry", "Change Retrieval"],
            horizontal=True
        )

        # Query input
        col1, col2 = st.columns([3, 1])
        with col1:
            query = st.text_input(
                "Enter your query",
                placeholder="e.g., What is the assert module in Node.js v20.0?"
            )

        with col2:
            compare_mode = st.checkbox("Compare with Baseline", value=True)

        # Version filter (only meaningful for content retrieval)
        if query_type == "Content Retrieval":
            version_filter = st.text_input(
                "Version Filter (optional)",
                placeholder="e.g., 1.2.0"
            )
        else:
            version_filter = None

        if st.button("🔍 Search", type="primary"):
            if not query:
                st.warning("Please enter a query!")
            else:
                with st.spinner("Searching..."):
                    start_time = time.time()

                    # VersionRAG query — dispatch on the selected query type.
                    if query_type == "Content Retrieval":
                        vrag_result = st.session_state.version_rag.query(
                            query=query,
                            version_filter=version_filter,
                            top_k=top_k
                        )
                    elif query_type == "Version Inquiry":
                        vrag_result = st.session_state.version_rag.version_inquiry(
                            query=query
                        )
                    else:  # Change Retrieval
                        vrag_result = st.session_state.version_rag.change_retrieval(
                            query=query
                        )

                    vrag_time = time.time() - start_time

                    # Baseline query (only when comparison is enabled;
                    # baseline_result / baseline_time exist only in that case).
                    if compare_mode:
                        start_time = time.time()
                        baseline_result = st.session_state.baseline_rag.query(
                            query=query,
                            top_k=top_k
                        )
                        baseline_time = time.time() - start_time

                # Display results — two columns when comparing, one otherwise.
                if compare_mode:
                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("### 🚀 VersionRAG Response")
                        st.markdown(f"**Response Time:** {vrag_time:.3f}s")
                        st.markdown("---")
                        st.markdown(vrag_result['answer'])

                        if 'sources' in vrag_result:
                            with st.expander("📚 Sources"):
                                for idx, source in enumerate(vrag_result['sources']):
                                    st.markdown(f"**Source {idx+1}**")
                                    st.markdown(f"- Version: `{source.get('version', 'N/A')}`")
                                    st.markdown(f"- File: `{source.get('filename', 'N/A')}`")
                                    st.markdown(f"- Similarity: {source.get('similarity', 0):.3f}")
                                    st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")

                    with col2:
                        st.markdown("### 📊 Baseline RAG Response")
                        st.markdown(f"**Response Time:** {baseline_time:.3f}s")
                        st.markdown("---")
                        st.markdown(baseline_result['answer'])

                        if 'sources' in baseline_result:
                            with st.expander("📚 Sources"):
                                for idx, source in enumerate(baseline_result['sources']):
                                    st.markdown(f"**Source {idx+1}**")
                                    st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")
                else:
                    st.markdown("### 🚀 VersionRAG Response")
                    st.markdown(f"**Response Time:** {vrag_time:.3f}s")
                    st.markdown("---")
                    st.markdown(vrag_result['answer'])

                    if 'sources' in vrag_result:
                        with st.expander("📚 Sources"):
                            for idx, source in enumerate(vrag_result['sources']):
                                st.markdown(f"**Source {idx+1}**")
                                st.markdown(f"- Version: `{source.get('version', 'N/A')}`")
                                st.markdown(f"- File: `{source.get('filename', 'N/A')}`")
                                st.markdown(f"- Similarity: {source.get('similarity', 0):.3f}")
                                st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")

                # Feedback
                # NOTE(review): this "Submit Feedback" button is nested inside the
                # "Search" button's branch; on a Streamlit rerun triggered by the
                # nested click the outer button reads False, so the feedback append
                # likely never executes — verify and consider a form/callback.
                st.markdown("### 📝 Feedback")
                col1, col2, col3 = st.columns([1, 1, 2])
                with col1:
                    rating = st.slider("Rate this answer", 1, 5, 3)
                with col2:
                    if st.button("Submit Feedback"):
                        st.session_state.feedback_data.append({
                            'query': query,
                            'query_type': query_type,
                            'rating': rating,
                            'timestamp': datetime.now().isoformat(),
                            'response_time': vrag_time
                        })
                        st.success("Thank you for your feedback!")

                # Add to chat history (baseline fields are None when not comparing).
                st.session_state.chat_history.append({
                    'query': query,
                    'query_type': query_type,
                    'vrag_answer': vrag_result['answer'],
                    'vrag_time': vrag_time,
                    'baseline_answer': baseline_result['answer'] if compare_mode else None,
                    'baseline_time': baseline_time if compare_mode else None,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

        # Chat history — most recent five, newest first.
        if st.session_state.chat_history:
            st.markdown("### 💭 Query History")
            for idx, chat in enumerate(reversed(st.session_state.chat_history[-5:])):
                with st.expander(f"{chat['timestamp']} - {chat['query'][:50]}..."):
                    st.markdown(f"**Query Type:** {chat['query_type']}")
                    st.markdown(f"**VersionRAG Answer:** {chat['vrag_answer'][:200]}...")
                    st.markdown(f"**Response Time:** {chat['vrag_time']:.3f}s")
461
+
462
# Tab 3: Evaluation
# Runs VersionRAG and the baseline over a QA dataset (custom JSON or the
# built-in Mini-VersionQA) and renders comparative metrics and charts.
with tab3:
    st.header("System Evaluation")

    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        st.markdown("""
        This section evaluates VersionRAG against the baseline system using the Mini-VersionQA dataset.
        Metrics include Hit@k, MRR, Accuracy, and Version-Sensitive Accuracy (VSA).
        """)

        # Evaluation dataset configuration
        st.markdown("### 📋 Evaluation Dataset Configuration")

        use_custom_dataset = st.checkbox("Use custom evaluation dataset")

        # FIX: qa_data was only assigned inside the branches below, so checking
        # "custom dataset" without uploading a file left it unbound and the
        # evaluation button raised NameError. Default it to None up front.
        qa_data = None
        if use_custom_dataset:
            uploaded_qa_file = st.file_uploader(
                "Upload QA Dataset (JSON)",
                type=["json"]
            )
            if uploaded_qa_file:
                qa_data = json.load(uploaded_qa_file)
                st.success(f"Loaded {len(qa_data)} questions")
        else:
            st.info("Using default Mini-VersionQA dataset")

        if st.button("🚀 Run Evaluation", type="primary"):
            with st.spinner("Running evaluation..."):
                try:
                    # Initialize evaluator over both systems.
                    evaluator = Evaluator(
                        version_rag=st.session_state.version_rag,
                        baseline_rag=st.session_state.baseline_rag
                    )

                    # Create or load dataset — fall back to the bundled set.
                    if qa_data:
                        dataset = VersionQADataset.from_dict(qa_data)
                    else:
                        dataset = VersionQADataset.create_mini_versionqa()

                    # Run evaluation and persist results for other tabs.
                    results = evaluator.evaluate(dataset)
                    st.session_state.evaluation_results = results

                    # Display results
                    st.markdown("### 📊 Evaluation Results")

                    col1, col2 = st.columns(2)

                    with col1:
                        st.markdown("#### 🚀 VersionRAG")
                        st.metric("Accuracy", f"{results['versionrag']['accuracy']:.2%}")
                        st.metric("Hit@5", f"{results['versionrag']['hit_at_5']:.2%}")
                        st.metric("MRR", f"{results['versionrag']['mrr']:.3f}")
                        st.metric("VSA", f"{results['versionrag']['vsa']:.2%}")
                        st.metric("Avg Latency", f"{results['versionrag']['avg_latency']:.3f}s")

                    with col2:
                        st.markdown("#### 📊 Baseline RAG")
                        st.metric("Accuracy", f"{results['baseline']['accuracy']:.2%}")
                        st.metric("Hit@5", f"{results['baseline']['hit_at_5']:.2%}")
                        st.metric("MRR", f"{results['baseline']['mrr']:.3f}")
                        st.metric("VSA", f"{results['baseline']['vsa']:.2%}")
                        st.metric("Avg Latency", f"{results['baseline']['avg_latency']:.3f}s")

                    # Performance improvement, in percentage points.
                    st.markdown("### 📈 Performance Improvement")
                    improvement = {
                        'Accuracy': (results['versionrag']['accuracy'] - results['baseline']['accuracy']) * 100,
                        'Hit@5': (results['versionrag']['hit_at_5'] - results['baseline']['hit_at_5']) * 100,
                        'MRR': (results['versionrag']['mrr'] - results['baseline']['mrr']) * 100,
                        'VSA': (results['versionrag']['vsa'] - results['baseline']['vsa']) * 100
                    }

                    fig = go.Figure(data=[
                        go.Bar(name='Improvement', x=list(improvement.keys()),
                               y=list(improvement.values()),
                               marker_color='lightblue')
                    ])
                    fig.add_hline(y=25, line_dash="dash", line_color="red",
                                  annotation_text="Target: 25 points")
                    fig.update_layout(
                        title="VersionRAG vs Baseline - Performance Improvement (percentage points)",
                        yaxis_title="Improvement (%)",
                        showlegend=False
                    )
                    st.plotly_chart(fig, use_container_width=True)

                    # Query type breakdown
                    st.markdown("### 🔍 Performance by Query Type")

                    query_types = ['Content Retrieval', 'Version Inquiry', 'Change Retrieval']
                    vrag_scores = [
                        results['versionrag']['by_type']['content_retrieval'],
                        results['versionrag']['by_type']['version_inquiry'],
                        results['versionrag']['by_type']['change_retrieval']
                    ]
                    baseline_scores = [
                        results['baseline']['by_type']['content_retrieval'],
                        results['baseline']['by_type']['version_inquiry'],
                        results['baseline']['by_type']['change_retrieval']
                    ]

                    fig = go.Figure(data=[
                        go.Bar(name='VersionRAG', x=query_types, y=vrag_scores),
                        go.Bar(name='Baseline', x=query_types, y=baseline_scores)
                    ])
                    fig.update_layout(
                        title="Accuracy by Query Type",
                        yaxis_title="Accuracy (%)",
                        barmode='group'
                    )
                    st.plotly_chart(fig, use_container_width=True)

                    # Success criteria check (thresholds in percent / points).
                    st.markdown("### ✅ Success Criteria")
                    criteria = {
                        'VSA Improvement ≥ 25 points': improvement['VSA'] >= 25,
                        'Content Retrieval ≥ 85%': vrag_scores[0] >= 85,
                        'Version Inquiry ≥ 90%': vrag_scores[1] >= 90,
                        'Change Retrieval ≥ 60%': vrag_scores[2] >= 60
                    }

                    for criterion, passed in criteria.items():
                        if passed:
                            st.success(f"✅ {criterion}")
                        else:
                            st.error(f"❌ {criterion}")

                except Exception as e:
                    st.error(f"Evaluation error: {str(e)}")
598
+
599
# Tab 4: Version Explorer
# Browses the version graph: per-document version timeline plus a
# two-version diff view rendered with the .diff-added/.diff-removed CSS.
with tab4:
    st.header("Version Explorer")

    if not st.session_state.graph_manager:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        # Document selection
        documents = st.session_state.graph_manager.get_all_documents()

        if not documents:
            st.info("No documents uploaded yet. Please upload documents in the 'Document Upload' tab.")
        else:
            selected_doc = st.selectbox("Select Document", documents)

            if selected_doc:
                # Get versions for selected document
                versions = st.session_state.graph_manager.get_document_versions(selected_doc)

                st.markdown(f"### 📚 {selected_doc}")
                st.markdown(f"**Total Versions:** {len(versions)}")

                # Timeline and comparison only make sense with >1 version.
                if len(versions) > 1:
                    st.markdown("### 📅 Version Timeline")
                    timeline_data = []
                    for v in sorted(versions):
                        version_info = st.session_state.graph_manager.get_version_info(
                            selected_doc, v
                        )
                        timeline_data.append({
                            'Version': v,
                            'Date': version_info.get('timestamp', 'N/A')
                        })

                    df = pd.DataFrame(timeline_data)
                    st.dataframe(df, use_container_width=True)

                    # Version comparison
                    st.markdown("### 🔄 Version Comparison")
                    col1, col2 = st.columns(2)

                    with col1:
                        version1 = st.selectbox("Version 1", sorted(versions), index=0)
                    with col2:
                        # Default the second selector to the next version when available.
                        version2 = st.selectbox("Version 2", sorted(versions),
                                                index=min(1, len(versions)-1))

                    if version1 and version2 and version1 != version2:
                        if st.button("Compare Versions"):
                            with st.spinner("Computing differences..."):
                                changes = st.session_state.graph_manager.get_changes_between_versions(
                                    selected_doc, version1, version2
                                )

                                st.markdown("### 📝 Changes Detected")

                                if changes['additions']:
                                    st.markdown("#### ➕ Additions")
                                    for add in changes['additions']:
                                        st.markdown(f'<div class="diff-added">{add}</div>',
                                                    unsafe_allow_html=True)

                                if changes['deletions']:
                                    st.markdown("#### ➖ Deletions")
                                    for delete in changes['deletions']:
                                        st.markdown(f'<div class="diff-removed">{delete}</div>',
                                                    unsafe_allow_html=True)

                                if changes['modifications']:
                                    st.markdown("#### 🔄 Modifications")
                                    for mod in changes['modifications']:
                                        st.markdown(f"- {mod}")

                                # Visualize change counts as a bar chart.
                                st.markdown("### 📊 Change Statistics")
                                change_stats = {
                                    'Additions': len(changes['additions']),
                                    'Deletions': len(changes['deletions']),
                                    'Modifications': len(changes['modifications'])
                                }

                                fig = px.bar(
                                    x=list(change_stats.keys()),
                                    y=list(change_stats.values()),
                                    title=f"Changes from {version1} to {version2}",
                                    labels={'x': 'Change Type', 'y': 'Count'}
                                )
                                st.plotly_chart(fig, use_container_width=True)
688
+
689
# Tab 5: Analytics
# Read-only dashboards over session-state data: query counts, ratings,
# response-time trend, and CSV export of history/feedback.
with tab5:
    st.header("System Analytics")

    # System statistics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Total Queries", len(st.session_state.chat_history))
    with col2:
        if st.session_state.feedback_data:
            avg_rating = sum(f['rating'] for f in st.session_state.feedback_data) / len(st.session_state.feedback_data)
            st.metric("Avg Rating", f"{avg_rating:.2f} / 5")
        else:
            st.metric("Avg Rating", "N/A")
    with col3:
        if st.session_state.chat_history:
            avg_response_time = sum(c['vrag_time'] for c in st.session_state.chat_history) / len(st.session_state.chat_history)
            st.metric("Avg Response Time", f"{avg_response_time:.3f}s")
        else:
            st.metric("Avg Response Time", "N/A")
    with col4:
        st.metric("Total Documents", len(st.session_state.uploaded_files))

    # Query type distribution
    if st.session_state.chat_history:
        st.markdown("### 📊 Query Type Distribution")
        query_type_counts = {}
        for chat in st.session_state.chat_history:
            qtype = chat['query_type']
            query_type_counts[qtype] = query_type_counts.get(qtype, 0) + 1

        fig = px.pie(
            values=list(query_type_counts.values()),
            names=list(query_type_counts.keys()),
            title="Distribution of Query Types"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Response time trend (needs at least two points for a line).
    if len(st.session_state.chat_history) > 1:
        st.markdown("### ⏱️ Response Time Trend")
        times = [c['vrag_time'] for c in st.session_state.chat_history]
        fig = go.Figure(data=go.Scatter(
            y=times,
            mode='lines+markers',
            name='Response Time'
        ))
        fig.update_layout(
            title="Response Time Over Queries",
            xaxis_title="Query Number",
            yaxis_title="Response Time (s)"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Feedback analysis
    if st.session_state.feedback_data:
        st.markdown("### 📝 User Feedback Analysis")

        # Rating distribution
        rating_counts = {}
        for feedback in st.session_state.feedback_data:
            rating = feedback['rating']
            rating_counts[rating] = rating_counts.get(rating, 0) + 1

        fig = go.Figure(data=[
            go.Bar(x=list(rating_counts.keys()), y=list(rating_counts.values()))
        ])
        fig.update_layout(
            title="Rating Distribution",
            xaxis_title="Rating",
            yaxis_title="Count"
        )
        st.plotly_chart(fig, use_container_width=True)

    # Export analytics
    # NOTE(review): st.download_button nested inside st.button only survives a
    # single rerun — the download link disappears on the next interaction;
    # confirm this is acceptable or render the download_button unconditionally.
    st.markdown("### 💾 Export Data")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Export Chat History"):
            if st.session_state.chat_history:
                df = pd.DataFrame(st.session_state.chat_history)
                csv = df.to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "chat_history.csv",
                    "text/csv"
                )

    with col2:
        if st.button("Export Feedback Data"):
            if st.session_state.feedback_data:
                df = pd.DataFrame(st.session_state.feedback_data)
                csv = df.to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "feedback_data.csv",
                    "text/csv"
                )
791
+
792
# Tab 6: Multi-User Management
# Shows the current session's identity/stats, explains data isolation,
# and offers session reset + JSON export of session data.
with tab6:
    st.header("Multi-User Management")

    st.markdown("""
    This section demonstrates VersionRAG's multi-user capabilities with logical data separation
    and persistent knowledge base management.
    """)

    # User session info
    st.markdown("### 👤 Current Session")
    col1, col2, col3 = st.columns(3)

    with col1:
        st.info(f"**User ID:** {st.session_state.user_id[:16]}...")
    with col2:
        st.info(f"**Documents:** {len(st.session_state.uploaded_files)}")
    with col3:
        st.info(f"**Queries:** {len(st.session_state.chat_history)}")

    # Data isolation demonstration
    st.markdown("### 🔒 Data Isolation")
    st.markdown("""
    Each user's knowledge base is logically separated using `tenant_id` metadata in ChromaDB.
    This ensures:
    - No data leakage between users
    - Independent query results
    - Isolated document management
    """)

    # Knowledge base status
    st.markdown("### 📚 Knowledge Base Status")

    if st.session_state.uploaded_files:
        kb_data = []
        for filename, info in st.session_state.uploaded_files.items():
            kb_data.append({
                'File': filename,
                'Version': info['version'],
                'Domain': info['domain'],
                'Topic': info['topic'],
                'Uploaded': info['timestamp'],
                'Hash': info['hash'][:12] + "..."
            })

        df = pd.DataFrame(kb_data)
        st.dataframe(df, use_container_width=True)

        # Persistent storage info
        st.success("""
        ✅ **Persistent Storage Active**
        - All documents are stored with file hash tracking
        - Unchanged files skip re-indexing
        - Automatic diff-based updates for modified files
        """)
    else:
        st.info("No documents in knowledge base. Upload documents to get started.")

    # Session management
    st.markdown("### 🔄 Session Management")

    col1, col2 = st.columns(2)

    with col1:
        # NOTE(review): a checkbox rendered inside a button branch only exists on
        # the rerun where the button was True, so this confirm-then-reset flow
        # likely never fires — verify and consider a form or two-step state flag.
        if st.button("🆕 Create New Session"):
            if st.checkbox("Confirm session reset"):
                st.session_state.user_id = str(uuid.uuid4())
                st.session_state.version_rag = None
                st.session_state.baseline_rag = None
                st.session_state.graph_manager = None
                st.session_state.uploaded_files = {}
                st.session_state.chat_history = []
                st.success("New session created!")
                st.rerun()

    with col2:
        if st.button("💾 Export Session Data"):
            # Exported dict includes uploaded_files, which carries full
            # document text — the JSON can be large for big uploads.
            session_data = {
                'user_id': st.session_state.user_id,
                'uploaded_files': st.session_state.uploaded_files,
                'chat_history': st.session_state.chat_history,
                'feedback_data': st.session_state.feedback_data,
                'timestamp': datetime.now().isoformat()
            }

            json_str = json.dumps(session_data, indent=2)
            st.download_button(
                "Download Session JSON",
                json_str,
                f"session_{st.session_state.user_id[:8]}.json",
                "application/json"
            )

    # UX Metrics
    st.markdown("### 📊 UX Metrics")

    col1, col2, col3 = st.columns(3)

    with col1:
        # Calculate reupload count (files with same name but different hash)
        # NOTE(review): hard-coded to 0 — the described calculation is not implemented.
        reupload_count = 0
        st.metric("Reupload Count", reupload_count,
                  help="Number of times files were reuploaded")

    with col2:
        if st.session_state.chat_history:
            avg_response = sum(c['vrag_time'] for c in st.session_state.chat_history) / len(st.session_state.chat_history)
            st.metric("Avg Response Time", f"{avg_response:.3f}s")
        else:
            st.metric("Avg Response Time", "N/A")

    with col3:
        cross_contamination = 0  # placeholder; detection would happen in production
        st.metric("Cross-User Contamination", cross_contamination,
                  help="Number of cross-user data leakage incidents")
907
+
908
# Footer — rendered once at the bottom of every tab.
st.markdown("---")
_FOOTER_HTML = """
<div style='text-align: center; color: #666;'>
<p>VersionRAG - Version-Aware Retrieval-Augmented Generation System</p>
<p>Built with Streamlit, LangChain, and ChromaDB</p>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
create_sample_dataset.py ADDED
@@ -0,0 +1,1228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # create_sample_dataset.py - Generate Sample Documents for Mini-VersionQA
2
+ import os
3
+
4
+ SAMPLE_DOCS = {
5
+ # Node.js Assert - 3 versions
6
+ "nodejs_assert_v20.0.txt": """# Node.js Assert Module v20.0
7
+
8
+ The assert module provides a set of assertion functions for testing invariants in your code.
9
+
10
+ ## Overview
11
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
12
+
13
+ ## Basic Usage
14
+ ```javascript
15
+ const assert = require('assert');
16
+
17
+ // Strict equality assertion
18
+ assert.strictEqual(1, 1);
19
+
20
+ // Deep equality assertion
21
+ assert.deepStrictEqual({a: 1}, {a: 1});
22
+ ```
23
+
24
+ ## Available Functions
25
+ - assert.ok(value): Tests if value is truthy
26
+ - assert.strictEqual(actual, expected): Tests strict equality
27
+ - assert.deepStrictEqual(actual, expected): Tests deep equality
28
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
29
+ - assert.throws(fn): Tests if function throws an error
30
+
31
+ ## Error Messages
32
+ When assertions fail, the assert module provides detailed error messages showing:
33
+ - The actual value received
34
+ - The expected value
35
+ - The assertion type that failed
36
+
37
+ Version: v20.0
38
+ Released: 2023-04
39
+ """,
40
+
41
+ "nodejs_assert_v21.0.txt": """# Node.js Assert Module v21.0
42
+
43
+ The assert module provides a set of assertion functions for testing invariants in your code.
44
+
45
+ ## Overview
46
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
47
+
48
+ ## NEW in v21.0: Strict Mode
49
+ **MAJOR ADDITION**: The assert module now includes a strict mode by default!
50
+
51
+ ```javascript
52
+ const assert = require('assert').strict;
53
+
54
+ // All assertions now use strict equality by default
55
+ assert.equal(1, 1); // Now uses strictEqual internally
56
+ ```
57
+
58
+ ## Basic Usage
59
+ ```javascript
60
+ const assert = require('assert');
61
+
62
+ // Strict equality assertion
63
+ assert.strictEqual(1, 1);
64
+
65
+ // Deep equality assertion
66
+ assert.deepStrictEqual({a: 1}, {a: 1});
67
+ ```
68
+
69
+ ## Available Functions
70
+ - assert.ok(value): Tests if value is truthy
71
+ - assert.strictEqual(actual, expected): Tests strict equality
72
+ - assert.deepStrictEqual(actual, expected): Tests deep equality
73
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
74
+ - assert.throws(fn): Tests if function throws an error
75
+ - **NEW**: assert.rejects(promise): Tests if promise rejects (async support)
76
+ - **NEW**: assert.strict: Strict mode enabled by default
77
+
78
+ ## Strict Mode Benefits
79
+ - Prevents common mistakes with type coercion
80
+ - Enforces strict equality checks
81
+ - Better error messages for mismatched types
82
+
83
+ ## Error Messages
84
+ When assertions fail, the assert module provides detailed error messages showing:
85
+ - The actual value received
86
+ - The expected value
87
+ - The assertion type that failed
88
+ - Stack trace for debugging
89
+
90
+ Version: v21.0
91
+ Released: 2023-10
92
+ """,
93
+
94
+ "nodejs_assert_v23.0.txt": """# Node.js Assert Module v23.0
95
+
96
+ The assert module provides a set of assertion functions for testing invariants in your code.
97
+
98
+ ## Overview
99
+ The assert module is used for writing tests. It provides functions to verify that your code is working as expected.
100
+
101
+ ## Strict Mode (Added in v21.0)
102
+ The assert module includes a strict mode by default:
103
+
104
+ ```javascript
105
+ const assert = require('assert').strict;
106
+
107
+ // All assertions now use strict equality by default
108
+ assert.equal(1, 1); // Uses strictEqual internally
109
+ ```
110
+
111
+ ## NEW in v23.0: Enhanced Diff Output
112
+ **MAJOR IMPROVEMENT**: Better visualization of differences in failed assertions!
113
+
114
+ ```javascript
115
+ // Now shows colored diff output for complex objects
116
+ assert.deepStrictEqual(
117
+ { user: { name: 'John', age: 30 } },
118
+ { user: { name: 'Jane', age: 30 } }
119
+ );
120
+ // Output shows highlighted differences with + and - markers
121
+ ```
122
+
123
+ ## Available Functions
124
+ - assert.ok(value): Tests if value is truthy
125
+ - assert.strictEqual(actual, expected): Tests strict equality
126
+ - assert.deepStrictEqual(actual, expected): Tests deep equality with enhanced diff
127
+ - assert.notStrictEqual(actual, expected): Tests strict inequality
128
+ - assert.throws(fn): Tests if function throws an error
129
+ - assert.rejects(promise): Tests if promise rejects (async support)
130
+ - assert.strict: Strict mode enabled by default
131
+ - **NEW**: assert.match(string, regexp): Tests string against regexp
132
+ - **NEW**: assert.snapshot(value, snapshot): Snapshot testing support
133
+
134
+ ## Strict Mode Benefits
135
+ - Prevents common mistakes with type coercion
136
+ - Enforces strict equality checks
137
+ - Better error messages for mismatched types
138
+
139
+ ## Enhanced Error Messages (v23.0)
140
+ - Color-coded diff output
141
+ - Side-by-side comparison for objects
142
+ - Detailed stack traces with source maps
143
+ - Performance metrics for failed assertions
144
+
145
+ Version: v23.0
146
+ Released: 2024-04
147
+ """,
148
+
149
+ # Bootstrap - 2 versions
150
+ "bootstrap_v5.2.txt": """# Bootstrap v5.2 Documentation
151
+
152
+ ## Grid System
153
+
154
+ Bootstrap includes a powerful mobile-first flexbox grid system for building layouts of all shapes and sizes.
155
+
156
+ ### Grid Classes
157
+ The grid system uses a series of containers, rows, and columns to layout and align content.
158
+
159
+ #### Container Classes
160
+ - `.container`: Fixed-width container
161
+ - `.container-fluid`: Full-width container
162
+ - `.container-{breakpoint}`: Responsive container
163
+
164
+ #### Row Classes
165
+ - `.row`: Creates a grid row
166
+ - `.row-cols-*`: Set number of columns
167
+
168
+ #### Column Classes
169
+ - `.col`: Equal-width columns
170
+ - `.col-{breakpoint}`: Responsive columns
171
+ - `.col-{number}`: Sized columns (1-12)
172
+ - `.col-{breakpoint}-{number}`: Responsive sized columns
173
+
174
+ ### Responsive Breakpoints
175
+ - xs: <576px
176
+ - sm: ≥576px
177
+ - md: ≥768px
178
+ - lg: ≥992px
179
+ - xl: ≥1200px
180
+ - xxl: ≥1400px
181
+
182
+ ### Example Usage
183
+ ```html
184
+ <div class="container">
185
+ <div class="row">
186
+ <div class="col-md-4">Column 1</div>
187
+ <div class="col-md-4">Column 2</div>
188
+ <div class="col-md-4">Column 3</div>
189
+ </div>
190
+ </div>
191
+ ```
192
+
193
+ ### Grid Gutters
194
+ - `.g-*`: Gutter spacing (0-5)
195
+ - `.gx-*`: Horizontal gutters
196
+ - `.gy-*`: Vertical gutters
197
+
198
+ Version: v5.2
199
+ Released: 2022-07
200
+ """,
201
+
202
+ "bootstrap_v5.3.txt": """# Bootstrap v5.3 Documentation
203
+
204
+ ## Grid System
205
+
206
+ Bootstrap includes a powerful mobile-first flexbox grid system for building layouts of all shapes and sizes.
207
+
208
+ ### Grid Classes
209
+ The grid system uses a series of containers, rows, and columns to layout and align content.
210
+
211
+ #### Container Classes
212
+ - `.container`: Fixed-width container
213
+ - `.container-fluid`: Full-width container
214
+ - `.container-{breakpoint}`: Responsive container
215
+
216
+ #### Row Classes
217
+ - `.row`: Creates a grid row
218
+ - `.row-cols-*`: Set number of columns
219
+
220
+ #### Column Classes
221
+ - `.col`: Equal-width columns
222
+ - `.col-{breakpoint}`: Responsive columns
223
+ - `.col-{number}`: Sized columns (1-12)
224
+ - `.col-{breakpoint}-{number}`: Responsive sized columns
225
+
226
+ ### Responsive Breakpoints
227
+ - xs: <576px
228
+ - sm: ≥576px
229
+ - md: ≥768px
230
+ - lg: ≥992px
231
+ - xl: ≥1200px
232
+ - xxl: ≥1400px
233
+
234
+ ### Example Usage
235
+ ```html
236
+ <div class="container">
237
+ <div class="row">
238
+ <div class="col-md-4">Column 1</div>
239
+ <div class="col-md-4">Column 2</div>
240
+ <div class="col-md-4">Column 3</div>
241
+ </div>
242
+ </div>
243
+ ```
244
+
245
+ ### Grid Gutters
246
+ - `.g-*`: Gutter spacing (0-5)
247
+ - `.gx-*`: Horizontal gutters
248
+ - `.gy-*`: Vertical gutters
249
+
250
+ ## NEW in v5.3: Utility Classes
251
+
252
+ ### Extended Color Utilities
253
+ **ADDITION**: New color utility classes for more granular control:
254
+ - `.text-primary-emphasis`
255
+ - `.text-secondary-emphasis`
256
+ - `.bg-primary-subtle`
257
+ - `.bg-secondary-subtle`
258
+ - `.border-primary-subtle`
259
+
260
+ ### Extended Spacing Utilities
261
+ **ADDITION**: New spacing utilities:
262
+ - `.p-*`: Padding (now includes half-step increments)
263
+ - `.m-*`: Margin (now includes half-step increments)
264
+ - Example: `.p-2-5` for padding of 0.625rem
265
+
266
+ ### Focus Ring Utilities
267
+ **NEW FEATURE**: Custom focus ring utilities:
268
+ - `.focus-ring`
269
+ - `.focus-ring-{color}`
270
+ - Provides accessible focus indicators
271
+
272
+ ### Link Utilities
273
+ **IMPROVEMENT**: Enhanced link utilities:
274
+ - `.link-opacity-*`: Control link opacity (10-100)
275
+ - `.link-underline-opacity-*`: Control underline opacity
276
+ - Better accessibility for link states
277
+
278
+ ### Example New Utilities
279
+ ```html
280
+ <div class="bg-primary-subtle p-2-5">
281
+ <a href="#" class="link-opacity-75">Accessible Link</a>
282
+ </div>
283
+ ```
284
+
285
+ Version: v5.3
286
+ Released: 2023-05
287
+ Changes from v5.2: Added emphasis colors, extended spacing, focus ring utilities, enhanced link controls
288
+ """,
289
+
290
+ # Spark - 2 versions
291
+ "spark_v3.0.txt": """# Apache Spark v3.0 Documentation
292
+
293
+ ## DataFrame API
294
+
295
+ DataFrames are distributed collections of data organized into named columns, conceptually equivalent to tables in relational databases.
296
+
297
+ ### Creating DataFrames
298
+
299
+ ```python
300
+ from pyspark.sql import SparkSession
301
+
302
+ spark = SparkSession.builder.appName("example").getOrCreate()
303
+
304
+ # From list
305
+ df = spark.createDataFrame([(1, "John"), (2, "Jane")], ["id", "name"])
306
+
307
+ # From RDD
308
+ rdd = spark.sparkContext.parallelize([(1, "John"), (2, "Jane")])
309
+ df = spark.createDataFrame(rdd, ["id", "name"])
310
+
311
+ # From file
312
+ df = spark.read.csv("data.csv", header=True, inferSchema=True)
313
+ ```
314
+
315
+ ### DataFrame Operations
316
+
317
+ #### Select
318
+ ```python
319
+ df.select("name").show()
320
+ df.select(df["name"], df["id"] + 1).show()
321
+ ```
322
+
323
+ #### Filter
324
+ ```python
325
+ df.filter(df["id"] > 1).show()
326
+ df.where(df["name"] == "John").show()
327
+ ```
328
+
329
+ #### GroupBy
330
+ ```python
331
+ df.groupBy("name").count().show()
332
+ df.groupBy("department").agg({"salary": "avg"}).show()
333
+ ```
334
+
335
+ #### Join
336
+ ```python
337
+ df1.join(df2, df1["id"] == df2["id"], "inner").show()
338
+ ```
339
+
340
+ ### Schema Definition
341
+ ```python
342
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType
343
+
344
+ schema = StructType([
345
+ StructField("id", IntegerType(), True),
346
+ StructField("name", StringType(), True)
347
+ ])
348
+
349
+ df = spark.createDataFrame(data, schema)
350
+ ```
351
+
352
+ ### Data Types
353
+ - IntegerType, LongType, FloatType, DoubleType
354
+ - StringType, BinaryType
355
+ - BooleanType
356
+ - DateType, TimestampType
357
+ - ArrayType, MapType, StructType
358
+
359
+ Version: v3.0
360
+ Released: 2020-06
361
+ """,
362
+
363
+ "spark_v3.5.txt": """# Apache Spark v3.5 Documentation
364
+
365
+ ## DataFrame API
366
+
367
+ DataFrames are distributed collections of data organized into named columns, conceptually equivalent to tables in relational databases.
368
+
369
+ ### Creating DataFrames
370
+
371
+ ```python
372
+ from pyspark.sql import SparkSession
373
+
374
+ spark = SparkSession.builder.appName("example").getOrCreate()
375
+
376
+ # From list
377
+ df = spark.createDataFrame([(1, "John"), (2, "Jane")], ["id", "name"])
378
+
379
+ # From file (improved in v3.5)
380
+ df = spark.read.csv("data.csv", header=True, inferSchema=True)
381
+ df = spark.read.json("data.json")
382
+ df = spark.read.parquet("data.parquet")
383
+ ```
384
+
385
+ ### DataFrame Operations
386
+
387
+ #### Select
388
+ ```python
389
+ df.select("name").show()
390
+ df.select(df["name"], df["id"] + 1).show()
391
+ ```
392
+
393
+ #### Filter
394
+ ```python
395
+ df.filter(df["id"] > 1).show()
396
+ df.where(df["name"] == "John").show()
397
+ ```
398
+
399
+ #### GroupBy
400
+ ```python
401
+ df.groupBy("name").count().show()
402
+ df.groupBy("department").agg({"salary": "avg"}).show()
403
+ ```
404
+
405
+ #### Join (Enhanced in v3.5)
406
+ ```python
407
+ # New: Support for multiple join types
408
+ df1.join(df2, df1["id"] == df2["id"], "inner").show()
409
+ df1.join(df2, "id", "left_outer").show() # Simplified syntax
410
+ ```
411
+
412
+ ### Schema Definition
413
+ ```python
414
+ from pyspark.sql.types import StructType, StructField, StringType, IntegerType
415
+
416
+ schema = StructType([
417
+ StructField("id", IntegerType(), True),
418
+ StructField("name", StringType(), True)
419
+ ])
420
+
421
+ df = spark.createDataFrame(data, schema)
422
+ ```
423
+
424
+ ### Data Types
425
+ - IntegerType, LongType, FloatType, DoubleType
426
+ - StringType, BinaryType
427
+ - BooleanType
428
+ - DateType, TimestampType
429
+ - ArrayType, MapType, StructType
430
+
431
+ ## REMOVED in v3.5
432
+
433
+ **DEPRECATED APIs REMOVED**:
434
+ - `DataFrame.inferSchema()` - Use `spark.read` with `inferSchema=True` instead
435
+ - `SQLContext` - Use `SparkSession` instead
436
+ - Legacy `RDD.toDF()` without schema - Now requires explicit schema
437
+ - Old Window functions syntax - Use new SQL standard syntax
438
+
439
+ **Breaking Changes**:
440
+ - Python 2 support removed
441
+ - Scala 2.11 support removed
442
+ - Legacy Hive metastore APIs removed
443
+
444
+ ## NEW in v3.5
445
+
446
+ **Performance Improvements**:
447
+ - Adaptive Query Execution (AQE) enabled by default
448
+ - Dynamic partition pruning enhancements
449
+ - Better join reordering
450
+
451
+ **New Features**:
452
+ - Built-in ML preprocessing functions
453
+ - Enhanced error messages with suggestions
454
+ - Better compatibility with Pandas 2.0
455
+
456
+ Version: v3.5
457
+ Released: 2023-09
458
+ Major Changes: Removed deprecated APIs, improved performance, Python 2 support dropped
459
+ """,
460
+
461
+ # Healthcare
462
+ "clinical_guidelines_v1.0.txt": """# Clinical Treatment Guidelines v1.0
463
+
464
+ ## Introduction
465
+ These guidelines provide evidence-based recommendations for patient care and treatment protocols.
466
+
467
+ ## General Treatment Protocols
468
+
469
+ ### Patient Assessment
470
+ 1. Initial examination and history taking
471
+ 2. Vital signs measurement
472
+ 3. Physical examination
473
+ 4. Laboratory tests as indicated
474
+ 5. Diagnostic imaging when necessary
475
+
476
+ ### Medication Administration
477
+ - Follow five rights: right patient, right drug, right dose, right route, right time
478
+ - Document all medications given
479
+ - Monitor for adverse reactions
480
+ - Patient education on medication use
481
+
482
+ ### Infection Control
483
+ - Standard precautions for all patients
484
+ - Hand hygiene before and after patient contact
485
+ - Use of personal protective equipment (PPE)
486
+ - Proper disposal of medical waste
487
+ - Environmental cleaning protocols
488
+
489
+ ### Pain Management
490
+ - Assess pain using standardized scales (0-10)
491
+ - Non-pharmacological interventions first
492
+ - Pharmacological options when indicated
493
+ - Regular reassessment and documentation
494
+ - Patient-controlled analgesia when appropriate
495
+
496
+ ### Common Conditions
497
+
498
+ #### Hypertension
499
+ - Target BP: <140/90 mmHg
500
+ - First-line: ACE inhibitors or thiazide diuretics
501
+ - Lifestyle modifications: diet, exercise, stress reduction
502
+ - Regular monitoring and follow-up
503
+
504
+ #### Diabetes Management
505
+ - Target HbA1c: <7%
506
+ - Blood glucose monitoring
507
+ - Insulin or oral hypoglycemics as indicated
508
+ - Dietary counseling
509
+ - Regular foot examinations
510
+
511
+ #### Respiratory Infections
512
+ - Symptomatic treatment
513
+ - Antibiotics only for bacterial infections
514
+ - Rest and hydration
515
+ - Isolation precautions if necessary
516
+
517
+ ### Documentation Requirements
518
+ - All interventions must be documented
519
+ - Adverse events reported immediately
520
+ - Patient progress notes daily
521
+ - Discharge planning initiated early
522
+
523
+ Version: v1.0
524
+ Effective Date: January 2023
525
+ """,
526
+
527
+ "clinical_guidelines_v2.0.txt": """# Clinical Treatment Guidelines v2.0
528
+
529
+ ## Introduction
530
+ These guidelines provide evidence-based recommendations for patient care and treatment protocols.
531
+
532
+ **UPDATED for v2.0**: Incorporates latest research findings and new treatment modalities.
533
+
534
+ ## General Treatment Protocols
535
+
536
+ ### Patient Assessment
537
+ 1. Initial examination and history taking
538
+ 2. Vital signs measurement (now includes SpO2 monitoring)
539
+ 3. Physical examination
540
+ 4. Laboratory tests as indicated
541
+ 5. Diagnostic imaging when necessary
542
+ 6. **NEW**: Risk stratification scoring
543
+
544
+ ### Medication Administration
545
+ - Follow five rights: right patient, right drug, right dose, right route, right time
546
+ - Document all medications given
547
+ - Monitor for adverse reactions
548
+ - Patient education on medication use
549
+ - **NEW**: Electronic verification system required
550
+ - **NEW**: Double-check protocol for high-risk medications
551
+
552
+ ### Infection Control
553
+ - Standard precautions for all patients
554
+ - Hand hygiene before and after patient contact
555
+ - Use of personal protective equipment (PPE)
556
+ - Proper disposal of medical waste
557
+ - Environmental cleaning protocols
558
+ - **NEW**: Enhanced protocols for multi-drug resistant organisms
559
+ - **NEW**: Mandatory staff screening during outbreaks
560
+
561
+ ### Pain Management
562
+ - Assess pain using standardized scales (0-10)
563
+ - Non-pharmacological interventions first
564
+ - Pharmacological options when indicated
565
+ - Regular reassessment and documentation
566
+ - Patient-controlled analgesia when appropriate
567
+ - **NEW**: Multimodal analgesia approach preferred
568
+ - **NEW**: Reduced opioid prescribing guidelines
569
+
570
+ ### Common Conditions
571
+
572
+ #### Hypertension (UPDATED)
573
+ - **NEW Target BP: <130/80 mmHg** (lowered from 140/90)
574
+ - First-line: ACE inhibitors or thiazide diuretics
575
+ - **NEW**: Consider combination therapy for BP >140/90
576
+ - Lifestyle modifications: diet, exercise, stress reduction
577
+ - Regular monitoring and follow-up
578
+ - **NEW**: Home blood pressure monitoring encouraged
579
+
580
+ #### Diabetes Management (UPDATED)
581
+ - Target HbA1c: <7% (individualized for elderly: <8%)
582
+ - Blood glucose monitoring
583
+ - **NEW**: GLP-1 agonists as first-line for cardiovascular benefit
584
+ - Insulin or oral hypoglycemics as indicated
585
+ - Dietary counseling with registered dietitian
586
+ - Regular foot examinations
587
+ - **NEW**: Annual retinal screening mandatory
588
+ - **NEW**: Cardiovascular risk assessment required
589
+
590
+ #### Respiratory Infections
591
+ - Symptomatic treatment
592
+ - Antibiotics only for bacterial infections
593
+ - Rest and hydration
594
+ - Isolation precautions if necessary
595
+ - **NEW**: Rapid PCR testing for influenza and COVID-19
596
+ - **NEW**: Updated isolation protocols
597
+
598
+ ### NEW SECTION: Telemedicine Protocols
599
+ - Video visit guidelines
600
+ - Remote monitoring for chronic conditions
601
+ - Digital prescription protocols
602
+ - Documentation requirements for virtual care
603
+
604
+ ### Documentation Requirements
605
+ - All interventions must be documented in EHR
606
+ - Adverse events reported immediately (within 24 hours)
607
+ - Patient progress notes daily
608
+ - Discharge planning initiated within 24 hours
609
+ - **NEW**: Quality metrics tracking required
610
+ - **NEW**: Patient satisfaction surveys
611
+
612
+ Version: v2.0
613
+ Effective Date: January 2024
614
+ Major Changes: Updated BP targets, new diabetes medications, enhanced infection control, telemedicine added
615
+ """,
616
+
617
+ # Finance
618
+ "compliance_fy2023.txt": """# Financial Compliance Report FY2023
619
+
620
+ ## Regulatory Overview
621
+ This document outlines the compliance requirements for financial reporting and operations for Fiscal Year 2023.
622
+
623
+ ## Key Regulations
624
+
625
+ ### SOX Compliance (Sarbanes-Oxley Act)
626
+ - Section 302: CEO/CFO certification of financial statements
627
+ - Section 404: Internal control assessment
628
+ - Section 409: Real-time disclosure of material changes
629
+ - Annual external audit required
630
+ - Quarterly internal control testing
631
+
632
+ ### Anti-Money Laundering (AML)
633
+ - Customer due diligence (CDD) required
634
+ - Transaction monitoring systems operational
635
+ - Suspicious Activity Reports (SARs) filed when appropriate
636
+ - Employee training completed annually
637
+ - Independent testing of AML program
638
+
639
+ ### Know Your Customer (KYC)
640
+ - Identity verification for all new customers
641
+ - Beneficial ownership identification
642
+ - Enhanced due diligence for high-risk customers
643
+ - Ongoing monitoring and updates
644
+ - Documentation retention for 5 years
645
+
646
+ ### Data Privacy
647
+ - GDPR compliance for EU customers
648
+ - CCPA compliance for California residents
649
+ - Data encryption at rest and in transit
650
+ - Access controls and authentication
651
+ - Breach notification procedures
652
+
653
+ ## Reporting Requirements
654
+
655
+ ### Financial Statements
656
+ - Quarterly 10-Q filings
657
+ - Annual 10-K filing
658
+ - Earnings releases
659
+ - Management Discussion & Analysis (MD&A)
660
+ - Audited financial statements
661
+
662
+ ### Regulatory Filings
663
+ - Form 13F for institutional investment managers
664
+ - Form 4 for insider transactions
665
+ - Schedule 13D/G for beneficial ownership
666
+ - Form 8-K for material events
667
+
668
+ ### Internal Reports
669
+ - Monthly management reports
670
+ - Quarterly compliance certifications
671
+ - Annual risk assessments
672
+ - Internal audit findings
673
+ - Board committee reports
674
+
675
+ ## Risk Management
676
+
677
+ ### Operational Risk
678
+ - Business continuity planning
679
+ - Disaster recovery testing
680
+ - Vendor management oversight
681
+ - Cybersecurity assessments
682
+ - Insurance coverage review
683
+
684
+ ### Market Risk
685
+ - Value at Risk (VaR) calculations
686
+ - Stress testing scenarios
687
+ - Concentration limits
688
+ - Hedging strategies
689
+ - Daily position monitoring
690
+
691
+ ### Credit Risk
692
+ - Credit rating assessments
693
+ - Exposure limits by counterparty
694
+ - Collateral management
695
+ - Provision for loan losses
696
+ - Portfolio diversification
697
+
698
+ ## Compliance Metrics FY2023
699
+ - Total regulatory filings: 48
700
+ - Internal audits conducted: 12
701
+ - Compliance training completion: 98%
702
+ - Zero material violations
703
+ - External audit: Clean opinion
704
+
705
+ Fiscal Year: 2023
706
+ Report Date: December 2023
707
+ """,
708
+
709
+ "compliance_fy2024.txt": """# Financial Compliance Report FY2024
710
+
711
+ ## Regulatory Overview
712
+ This document outlines the compliance requirements for financial reporting and operations for Fiscal Year 2024.
713
+
714
+ **MAJOR UPDATES for FY2024**: New SEC rules, enhanced cybersecurity requirements, and ESG disclosures.
715
+
716
+ ## Key Regulations
717
+
718
+ ### SOX Compliance (Sarbanes-Oxley Act)
719
+ - Section 302: CEO/CFO certification of financial statements
720
+ - Section 404: Internal control assessment
721
+ - Section 409: Real-time disclosure of material changes
722
+ - Annual external audit required
723
+ - Quarterly internal control testing
724
+ - **NEW**: Enhanced documentation requirements
725
+
726
+ ### Anti-Money Laundering (AML)
727
+ - Customer due diligence (CDD) required
728
+ - Transaction monitoring systems operational
729
+ - Suspicious Activity Reports (SARs) filed when appropriate
730
+ - Employee training completed annually
731
+ - Independent testing of AML program
732
+ - **NEW**: Real-time transaction monitoring enhanced
733
+ - **NEW**: Cryptocurrency transaction monitoring added
734
+
735
+ ### Know Your Customer (KYC)
736
+ - Identity verification for all new customers
737
+ - Beneficial ownership identification
738
+ - Enhanced due diligence for high-risk customers
739
+ - Ongoing monitoring and updates
740
+ - Documentation retention for 5 years
741
+ - **NEW**: Biometric verification for high-risk accounts
742
+ - **NEW**: Automated screening against sanctions lists
743
+
744
+ ### Data Privacy (UPDATED)
745
+ - GDPR compliance for EU customers
746
+ - CCPA compliance for California residents
747
+ - **NEW**: CPRA (California Privacy Rights Act) requirements
748
+ - Data encryption at rest and in transit
749
+ - Access controls and multi-factor authentication
750
+ - Breach notification procedures
751
+ - **NEW**: Data mapping and inventory required
752
+ - **NEW**: Privacy impact assessments for new systems
753
+
754
+ ### NEW: Cybersecurity Disclosure Rules
755
+ - **MAJOR ADDITION**: SEC cybersecurity disclosure requirements
756
+ - Material cybersecurity incidents reported within 4 days
757
+ - Annual cybersecurity governance disclosure
758
+ - Board oversight of cybersecurity risk
759
+ - Incident response plan documented and tested
760
+
761
+ ### NEW: ESG Disclosure Requirements
762
+ - **MAJOR ADDITION**: Climate-related disclosure rules
763
+ - Scope 1 and 2 emissions reporting
764
+ - Material climate risks identified
765
+ - Board oversight of climate risks
766
+ - Third-party assurance of emissions data
767
+
768
+ ## Reporting Requirements
769
+
770
+ ### Financial Statements
771
+ - Quarterly 10-Q filings
772
+ - Annual 10-K filing
773
+ - Earnings releases
774
+ - Management Discussion & Analysis (MD&A)
775
+ - Audited financial statements
776
+ - **NEW**: Inline XBRL tagging required
777
+
778
+ ### Regulatory Filings
779
+ - Form 13F for institutional investment managers
780
+ - Form 4 for insider transactions
781
+ - Schedule 13D/G for beneficial ownership
782
+ - Form 8-K for material events
783
+ - **NEW**: Form 8-K for cybersecurity incidents
784
+ - **NEW**: Climate disclosure forms
785
+
786
+ ### Internal Reports
787
+ - Monthly management reports
788
+ - Quarterly compliance certifications
789
+ - Annual risk assessments
790
+ - Internal audit findings
791
+ - Board committee reports
792
+ - **NEW**: Monthly cybersecurity dashboards
793
+ - **NEW**: Quarterly ESG metrics
794
+
795
+ ## Risk Management
796
+
797
+ ### Operational Risk
798
+ - Business continuity planning
799
+ - Disaster recovery testing (now quarterly)
800
+ - Vendor management oversight with annual reviews
801
+ - **NEW**: Third-party risk assessment enhanced
802
+ - Cybersecurity assessments (now monthly)
803
+ - Insurance coverage review
804
+ - **NEW**: Ransomware response protocols
805
+
806
+ ### Market Risk
807
+ - Value at Risk (VaR) calculations
808
+ - Stress testing scenarios (now includes crypto)
809
+ - Concentration limits
810
+ - Hedging strategies
811
+ - Daily position monitoring
812
+ - **NEW**: Climate scenario analysis
813
+
814
+ ### Credit Risk
815
+ - Credit rating assessments
816
+ - Exposure limits by counterparty
817
+ - Collateral management
818
+ - Provision for loan losses (CECL methodology)
819
+ - Portfolio diversification
820
+ - **NEW**: ESG factors in credit analysis
821
+
822
+ ### NEW: Cybersecurity Risk
823
+ - Penetration testing quarterly
824
+ - Vulnerability assessments monthly
825
+ - Security awareness training for all employees
826
+ - Incident response plan tested annually
827
+ - 24/7 security operations center
828
+ - Zero-trust architecture implementation
829
+
830
+ ## Compliance Metrics FY2024
831
+ - Total regulatory filings: 56 (↑17% from FY2023)
832
+ - Internal audits conducted: 16 (↑33%)
833
+ - Compliance training completion: 99.5%
834
+ - Zero material violations
835
+ - External audit: Clean opinion
836
+ - **NEW**: Cybersecurity incidents reported: 0
837
+ - **NEW**: ESG disclosure score: A-
838
+
839
+ Fiscal Year: 2024
840
+ Report Date: December 2024
841
+ Major Changes: New SEC cybersecurity rules, ESG disclosures added, enhanced AML monitoring, CPRA compliance
842
+ """,
843
+
844
+ # Industrial
845
+ "machine_operation_rev1.0.txt": """# Industrial Machine Operation Manual - Rev. 1.0
846
+
847
+ ## Equipment Overview
848
+ High-precision CNC milling machine for metal fabrication operations.
849
+
850
+ Model: IMM-5000
851
+ Serial Number: [Unit Specific]
852
+ Manufacturer: Industrial Machines Inc.
853
+
854
+ ## Safety Requirements
855
+
856
+ ### Personal Protective Equipment (PPE)
857
+ - Safety glasses with side shields (ANSI Z87.1)
858
+ - Steel-toed safety boots
859
+ - Hearing protection (>85 dB areas)
860
+ - Machine operator gloves
861
+ - No loose clothing or jewelry
862
+
863
+ ### Machine Safety Features
864
+ - Emergency stop button (red mushroom head)
865
+ - Safety interlocks on all access doors
866
+ - Light curtain protection system
867
+ - Audible alarm before operation
868
+ - Fire suppression system
869
+
870
+ ## Startup Procedure
871
+
872
+ ### Pre-Startup Checks
873
+ 1. Inspect machine for visible damage or wear
874
+ 2. Check all safety guards are in place
875
+ 3. Verify emergency stop functions properly
876
+ 4. Ensure work area is clean and clear
877
+ 5. Check coolant levels (minimum 80%)
878
+ 6. Inspect cutting tools for wear or damage
879
+ 7. Verify power supply voltage (480V 3-phase)
880
+
881
+ ### Startup Sequence
882
+ 1. Turn main power switch to ON position
883
+ 2. Wait for hydraulic system to pressurize (indicator light)
884
+ 3. Initialize machine control system (press INIT button)
885
+ 4. Perform axis homing sequence (X, Y, Z axes)
886
+ 5. Load machining program into controller
887
+ 6. Verify tool offset data
888
+ 7. Perform dry run without material
889
+ 8. Load workpiece and secure in fixture
890
+ 9. Set spindle speed and feed rate
891
+ 10. Begin machining operation
892
+
893
+ ## Operation
894
+
895
+ ### Standard Operating Parameters
896
+ - Spindle speed range: 100-6000 RPM
897
+ - Feed rate: 1-500 inches per minute
898
+ - Maximum workpiece weight: 2000 lbs
899
+ - Coolant flow rate: 10 GPM
900
+ - Operating temperature: 60-90°F
901
+
902
+ ### Control Panel Functions
903
+ - CYCLE START: Begins programmed operation
904
+ - CYCLE STOP: Pauses operation
905
+ - EMERGENCY STOP: Immediate shutdown
906
+ - FEED HOLD: Temporarily pauses feed motion
907
+ - JOG: Manual axis movement
908
+ - SPINDLE OVERRIDE: Adjust spindle speed (50-150%)
909
+
910
+ ### Monitoring During Operation
911
+ - Watch for unusual vibrations
912
+ - Listen for abnormal sounds
913
+ - Monitor coolant flow
914
+ - Check chip evacuation
915
+ - Verify dimensional accuracy periodically
916
+ - Monitor cutting tool wear
917
+
918
+ ## Shutdown Procedure
919
+
920
+ 1. Complete current machining cycle
921
+ 2. Press CYCLE STOP button
922
+ 3. Return spindle to home position
923
+ 4. Stop spindle rotation
924
+ 5. Turn off coolant system
925
+ 6. Remove workpiece
926
+ 7. Clean machine surfaces and work area
927
+ 8. Shut down control system
928
+ 9. Turn off main power switch
929
+ 10. Complete operator log entry
930
+
931
+ ## Maintenance Schedule
932
+
933
+ ### Daily
934
+ - Clean machine surfaces
935
+ - Check coolant level and condition
936
+ - Inspect cutting tools
937
+ - Verify all safety features
938
+ - Lubricate way surfaces
939
+
940
+ ### Weekly
941
+ - Check hydraulic fluid level
942
+ - Inspect electrical connections
943
+ - Test emergency stop function
944
+ - Clean coolant tank filter
945
+
946
+ ### Monthly
947
+ - Full machine cleaning
948
+ - Lubrication of all grease points
949
+ - Check belt tensions
950
+ - Calibrate tools
951
+ - Inspect safety guards
952
+
953
+ ### Annual
954
+ - Professional maintenance service
955
+ - Complete electrical inspection
956
+ - Hydraulic system service
957
+ - Accuracy verification
958
+ - Safety system certification
959
+
960
+ ## Troubleshooting
961
+
962
+ ### Machine Won't Start
963
+ - Check main power supply
964
+ - Verify emergency stop is reset
965
+ - Check for blown fuses
966
+ - Inspect door interlocks
967
+
968
+ ### Poor Surface Finish
969
+ - Check cutting tool condition
970
+ - Verify proper speeds and feeds
971
+ - Check machine rigidity
972
+ - Inspect coolant flow
973
+
974
+ ### Dimensional Inaccuracy
975
+ - Verify tool offsets
976
+ - Check for thermal growth
977
+ - Inspect ball screws
978
+ - Verify workpiece fixturing
979
+
980
+ Revision: 1.0
981
+ Date: January 2023
982
+ """,
983
+
984
+ "machine_operation_rev2.0.txt": """# Industrial Machine Operation Manual - Rev. 2.0
985
+
986
+ ## Equipment Overview
987
+ High-precision CNC milling machine for metal fabrication operations.
988
+
989
+ Model: IMM-5000
990
+ Serial Number: [Unit Specific]
991
+ Manufacturer: Industrial Machines Inc.
992
+
993
+ **UPDATED Rev. 2.0**: Enhanced safety features, automated monitoring, and improved procedures.
994
+
995
+ ## Safety Requirements
996
+
997
+ ### Personal Protective Equipment (PPE)
998
+ - Safety glasses with side shields (ANSI Z87.1)
999
+ - Steel-toed safety boots
1000
+ - Hearing protection (>85 dB areas)
1001
+ - Machine operator gloves
1002
+ - No loose clothing or jewelry
1003
+ - **NEW**: Cut-resistant sleeves for tool changing
1004
+
1005
+ ### Machine Safety Features
1006
+ - Emergency stop button (red mushroom head)
1007
+ - Safety interlocks on all access doors
1008
+ - Light curtain protection system
1009
+ - Audible alarm before operation
1010
+ - Fire suppression system
1011
+ - **NEW**: Automatic door locking during operation
1012
+ - **NEW**: Collision detection system
1013
+ - **NEW**: Automatic power-off on anomaly detection
1014
+ - **NEW**: Video monitoring system
1015
+ - **NEW**: Operator presence detection
1016
+
1017
+ ### NEW: Enhanced Safety Protocols
1018
+ - **ADDITION**: Two-person operation required for large workpieces
1019
+ - **ADDITION**: Mandatory safety briefing before first daily use
1020
+ - **ADDITION**: Personal lockout/tagout procedures
1021
+ - **ADDITION**: Near-miss reporting system
1022
+ - **ADDITION**: Monthly safety drills
1023
+
1024
+ ## Startup Procedure
1025
+
1026
+ ### Pre-Startup Checks
1027
+ 1. Inspect machine for visible damage or wear
1028
+ 2. Check all safety guards are in place
1029
+ 3. Verify emergency stop functions properly
1030
+ 4. Ensure work area is clean and clear (5S standards)
1031
+ 5. Check coolant levels (minimum 80%)
1032
+ 6. Inspect cutting tools for wear or damage
1033
+ 7. Verify power supply voltage (480V 3-phase)
1034
+ 8. **NEW**: Complete digital pre-start checklist on HMI
1035
+ 9. **NEW**: Verify backup systems operational
1036
+ 10. **NEW**: Check air pressure (90 PSI minimum)
1037
+
1038
+ ### Startup Sequence
1039
+ 1. Turn main power switch to ON position
1040
+ 2. Wait for hydraulic system to pressurize (indicator light)
1041
+ 3. Initialize machine control system (press INIT button)
1042
+ 4. Perform axis homing sequence (X, Y, Z axes)
1043
+ 5. **NEW**: System automatically runs diagnostics
1044
+ 6. Load machining program into controller
1045
+ 7. Verify tool offset data
1046
+ 8. **NEW**: Automatic tool measurement cycle
1047
+ 9. Perform dry run without material
1048
+ 10. Load workpiece and secure in fixture
1049
+ 11. **NEW**: Scan operator badge for authorization
1050
+ 12. Set spindle speed and feed rate
1051
+ 13. **NEW**: System verifies parameters within safe limits
1052
+ 14. Begin machining operation
1053
+
1054
+ ## Operation
1055
+
1056
+ ### Standard Operating Parameters
1057
+ - Spindle speed range: 100-8000 RPM (↑ from 6000)
1058
+ - Feed rate: 1-500 inches per minute
1059
+ - Maximum workpiece weight: 2000 lbs
1060
+ - Coolant flow rate: 10 GPM
1061
+ - Operating temperature: 60-90°F
1062
+ - **NEW**: Automatic parameter optimization based on material
1063
+ - **NEW**: Real-time monitoring and adjustment
1064
+
1065
+ ### Control Panel Functions
1066
+ - CYCLE START: Begins programmed operation
1067
+ - CYCLE STOP: Pauses operation
1068
+ - EMERGENCY STOP: Immediate shutdown
1069
+ - FEED HOLD: Temporarily pauses feed motion
1070
+ - JOG: Manual axis movement
1071
+ - SPINDLE OVERRIDE: Adjust spindle speed (50-150%)
1072
+ - **NEW**: ADAPTIVE CONTROL: Auto-optimizes feeds/speeds
1073
+ - **NEW**: REMOTE MONITORING: View status on mobile app
1074
+
1075
+ ### Monitoring During Operation
1076
+ - Watch for unusual vibrations
1077
+ - Listen for abnormal sounds
1078
+ - Monitor coolant flow
1079
+ - Check chip evacuation
1080
+ - Verify dimensional accuracy periodically
1081
+ - Monitor cutting tool wear
1082
+ - **NEW**: Automated vibration monitoring alerts operator
1083
+ - **NEW**: Tool wear prediction system
1084
+ - **NEW**: Automatic quality checks every 10 parts
1085
+ - **NEW**: Energy consumption tracking
1086
+
1087
+ ### NEW: Automated Features
1088
+ - Automatic tool changer with 40-tool capacity
1089
+ - In-cycle tool measurement
1090
+ - Adaptive feed control
1091
+ - Predictive maintenance alerts
1092
+ - Remote diagnostics capability
1093
+ - Automatic program backup
1094
+ - Production counter with yield tracking
1095
+
1096
+ ## Shutdown Procedure
1097
+
1098
+ 1. Complete current machining cycle
1099
+ 2. Press CYCLE STOP button
1100
+ 3. **NEW**: Allow automatic cooldown cycle (2 minutes)
1101
+ 4. Return spindle to home position
1102
+ 5. Stop spindle rotation
1103
+ 6. Turn off coolant system
1104
+ 7. **NEW**: System automatically drains coolant from spindle
1105
+ 8. Remove workpiece
1106
+ 9. Clean machine surfaces and work area
1107
+ 10. **NEW**: Complete digital operator log on HMI
1108
+ 11. Shut down control system
1109
+ 12. Turn off main power switch
1110
+ 13. **NEW**: System generates daily production report
1111
+
1112
+ ## Maintenance Schedule
1113
+
1114
+ ### Daily
1115
+ - Clean machine surfaces
1116
+ - Check coolant level and condition
1117
+ - Inspect cutting tools
1118
+ - Verify all safety features
1119
+ - Lubricate way surfaces
1120
+ - **NEW**: Review automated diagnostic report
1121
+ - **NEW**: Check chip conveyor operation
1122
+
1123
+ ### Weekly
1124
+ - Check hydraulic fluid level
1125
+ - Inspect electrical connections
1126
+ - Test emergency stop function
1127
+ - Clean coolant tank filter
1128
+ - **NEW**: Review vibration analysis data
1129
+ - **NEW**: Update tool life database
1130
+
1131
+ ### Monthly
1132
+ - Full machine cleaning
1133
+ - Lubrication of all grease points
1134
+ - Check belt tensions
1135
+ - Calibrate tools
1136
+ - Inspect safety guards
1137
+ - **NEW**: Thermal imaging inspection
1138
+ - **NEW**: Backup all programs and parameters
1139
+
1140
+ ### Quarterly (NEW)
1141
+ - Professional calibration service
1142
+ - Update control software
1143
+ - Test all safety interlocks
1144
+ - Inspect for wear on critical components
1145
+ - Review maintenance logs
1146
+
1147
+ ### Annual
1148
+ - Professional maintenance service
1149
+ - Complete electrical inspection
1150
+ - Hydraulic system service
1151
+ - Accuracy verification (laser interferometer)
1152
+ - Safety system certification
1153
+ - **NEW**: Complete machine recalibration
1154
+ - **NEW**: Operator retraining and certification
1155
+
1156
+ ## Troubleshooting
1157
+
1158
+ ### Machine Won't Start
1159
+ - Check main power supply
1160
+ - Verify emergency stop is reset
1161
+ - Check for blown fuses
1162
+ - Inspect door interlocks
1163
+ - **NEW**: Review diagnostic error codes on HMI
1164
+ - **NEW**: Check operator authorization
1165
+
1166
+ ### Poor Surface Finish
1167
+ - Check cutting tool condition
1168
+ - Verify proper speeds and feeds
1169
+ - Check machine rigidity
1170
+ - Inspect coolant flow
1171
+ - **NEW**: Review vibration monitoring data
1172
+ - **NEW**: Check automatic compensation settings
1173
+
1174
+ ### Dimensional Inaccuracy
1175
+ - Verify tool offsets
1176
+ - Check for thermal growth
1177
+ - Inspect ball screws
1178
+ - Verify workpiece fixturing
1179
+ - **NEW**: Run automatic calibration routine
1180
+ - **NEW**: Check environmental temperature
1181
+
1182
+ ### NEW: Automated Diagnostics
1183
+ - System automatically logs errors
1184
+ - Predictive maintenance alerts
1185
+ - Remote support connection available
1186
+ - QR codes for instant technical manual access
1187
+ - Video troubleshooting guides on HMI
1188
+
1189
+ ## NEW SECTION: Industry 4.0 Integration
1190
+ - IoT connectivity for production monitoring
1191
+ - Integration with MES (Manufacturing Execution System)
1192
+ - Real-time OEE (Overall Equipment Effectiveness) tracking
1193
+ - Automatic inventory management of tools and consumables
1194
+ - Predictive maintenance using machine learning
1195
+ - Digital twin simulation capability
1196
+
1197
+ Revision: 2.0
1198
+ Date: January 2024
1199
+ Major Changes: Enhanced safety features (collision detection, presence sensors), automated monitoring, predictive maintenance, Industry 4.0 connectivity, increased spindle speed range
1200
+ """
1201
+ }
1202
+
1203
def create_dataset():
    """Create the sample dataset directory and write all SAMPLE_DOCS files.

    Writes every (filename, content) pair from the module-level SAMPLE_DOCS
    dict into the ``sample_data`` directory (created if missing), then prints
    a summary of the dataset distribution.
    """
    dataset_dir = "sample_data"
    os.makedirs(dataset_dir, exist_ok=True)

    print(f"Creating sample dataset in '{dataset_dir}' directory...")

    for filename, content in SAMPLE_DOCS.items():
        filepath = os.path.join(dataset_dir, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        # Bug fix: the original printed the literal placeholder "(unknown)"
        # instead of the name of the file that was just written.
        print(f"✓ Created {filename}")

    print(f"\n✅ Successfully created {len(SAMPLE_DOCS)} sample documents!")
    print(f"\nDataset distribution:")
    print("- Software (Node.js): 3 files")
    print("- Software (Bootstrap): 2 files")
    print("- Software (Spark): 2 files")
    print("- Healthcare: 2 files")
    print("- Finance: 2 files")
    print("- Industrial: 2 files")
    print("=" * 50)
    print("Total: 13 documents covering 6 domains")
1226
+
1227
# Script entry point: generate the sample dataset when run directly.
if __name__ == "__main__":
    create_dataset()
evaluation.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation.py - Evaluation System
2
+ from typing import List, Dict, Tuple
3
+ import time
4
+ import numpy as np
5
+ from dataclasses import dataclass
6
+ import json
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
@dataclass
class Question:
    """Represents a single evaluation question for version-aware QA."""
    query: str            # natural-language question posed to the system
    query_type: str  # content_retrieval, version_inquiry, change_retrieval
    expected_answer: str  # reference answer used for content/keyword scoring
    expected_version: str  # version the answer should come from, or "all"
    domain: str           # e.g. Software, Healthcare, Finance, Industrial
    topic: str            # document topic within the domain
    # Optional extra keywords for scoring; the original annotation claimed a
    # plain List[str] while defaulting to None.
    expected_keywords: 'List[str] | None' = None
19
+
20
class VersionQADataset:
    """Dataset for evaluating version-aware QA.

    Wraps a list of Question objects and provides dict (de)serialization
    plus a factory for the built-in Mini-VersionQA benchmark.
    """

    def __init__(self, questions: List[Question]):
        self.questions = questions

    @classmethod
    def create_mini_versionqa(cls) -> 'VersionQADataset':
        """Create the Mini-VersionQA dataset as specified.

        13 hand-written questions spanning 6 domains and the three query
        types (content_retrieval, version_inquiry, change_retrieval).
        """
        questions = [
            # Software - Node.js Assert
            Question(
                query="What is the assert module in Node.js v20.0?",
                query_type="content_retrieval",
                expected_answer="assert module provides testing functions",
                expected_version="v20.0",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["assert", "testing", "module"]
            ),
            Question(
                query="List all versions of the assert module",
                query_type="version_inquiry",
                expected_answer="v20.0, v21.0, v23.0",
                expected_version="all",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["v20.0", "v21.0", "v23.0"]
            ),
            Question(
                query="When was the strict mode added to assert?",
                query_type="change_retrieval",
                expected_answer="v21.0",
                expected_version="v21.0",
                domain="Software",
                topic="Node.js Assert",
                expected_keywords=["strict", "mode", "v21.0"]
            ),

            # Software - Bootstrap
            Question(
                query="What are the grid classes in Bootstrap v5.2?",
                query_type="content_retrieval",
                expected_answer="col-*, row classes for responsive grid",
                expected_version="v5.2",
                domain="Software",
                topic="Bootstrap",
                expected_keywords=["grid", "col", "row"]
            ),
            Question(
                query="What changed in Bootstrap from v5.2 to v5.3?",
                query_type="change_retrieval",
                expected_answer="new utility classes and improvements",
                expected_version="v5.3",
                domain="Software",
                topic="Bootstrap",
                expected_keywords=["utility", "classes", "v5.3"]
            ),

            # Software - Spark
            Question(
                query="How does DataFrame work in Spark v3.0?",
                query_type="content_retrieval",
                expected_answer="distributed collection of data organized into named columns",
                expected_version="v3.0",
                domain="Software",
                topic="Spark",
                expected_keywords=["dataframe", "distributed", "columns"]
            ),
            Question(
                query="What was removed in Spark v3.5?",
                query_type="change_retrieval",
                expected_answer="deprecated APIs and legacy features",
                expected_version="v3.5",
                domain="Software",
                topic="Spark",
                expected_keywords=["removed", "deprecated", "v3.5"]
            ),

            # Healthcare
            Question(
                query="What are the treatment guidelines in v1.0?",
                query_type="content_retrieval",
                expected_answer="standard treatment protocols for patient care",
                expected_version="v1.0",
                domain="Healthcare",
                topic="Clinical Guidelines",
                expected_keywords=["treatment", "protocols", "guidelines"]
            ),
            Question(
                query="What changed in clinical guidelines from v1.0 to v2.0?",
                query_type="change_retrieval",
                expected_answer="updated treatment protocols and new recommendations",
                expected_version="v2.0",
                domain="Healthcare",
                topic="Clinical Guidelines",
                expected_keywords=["updated", "protocols", "v2.0"]
            ),

            # Finance
            Question(
                query="What are the compliance requirements in FY2023?",
                query_type="content_retrieval",
                expected_answer="regulatory compliance requirements for financial reporting",
                expected_version="FY2023",
                domain="Finance",
                topic="Compliance Reports",
                expected_keywords=["compliance", "requirements", "regulatory"]
            ),
            Question(
                query="What regulations changed from FY2023 to FY2024?",
                query_type="change_retrieval",
                expected_answer="new regulatory requirements and updated compliance standards",
                expected_version="FY2024",
                domain="Finance",
                topic="Compliance Reports",
                expected_keywords=["regulations", "changed", "FY2024"]
            ),

            # Industrial
            Question(
                query="What is the startup procedure in Rev. 1.0?",
                query_type="content_retrieval",
                expected_answer="machine startup steps and initialization procedures",
                expected_version="Rev. 1.0",
                domain="Industrial",
                topic="Machine Operation",
                expected_keywords=["startup", "procedure", "machine"]
            ),
            Question(
                query="What safety features were added in Rev. 2.0?",
                query_type="change_retrieval",
                expected_answer="enhanced safety features and emergency protocols",
                expected_version="Rev. 2.0",
                domain="Industrial",
                topic="Machine Operation",
                expected_keywords=["safety", "features", "Rev. 2.0"]
            ),
        ]

        return cls(questions)

    @classmethod
    def from_dict(cls, data: List[Dict]) -> 'VersionQADataset':
        """Load dataset from dictionary.

        Each entry must carry the required Question fields; missing
        'expected_keywords' defaults to an empty list.
        """
        questions = []
        for q in data:
            questions.append(Question(
                query=q['query'],
                query_type=q['query_type'],
                expected_answer=q['expected_answer'],
                expected_version=q['expected_version'],
                domain=q['domain'],
                topic=q['topic'],
                expected_keywords=q.get('expected_keywords', [])
            ))
        return cls(questions)

    def to_dict(self) -> List[Dict]:
        """Convert dataset to a list of plain dicts (inverse of from_dict)."""
        return [
            {
                'query': q.query,
                'query_type': q.query_type,
                'expected_answer': q.expected_answer,
                'expected_version': q.expected_version,
                'domain': q.domain,
                'topic': q.topic,
                'expected_keywords': q.expected_keywords
            }
            for q in self.questions
        ]
192
+
193
class Evaluator:
    """Evaluates VersionRAG and Baseline systems.

    Runs every question in a VersionQADataset through both systems, scores
    each answer on content overlap, keyword coverage and version awareness,
    and aggregates per-system metrics plus the improvement deltas.
    """

    def __init__(self, version_rag, baseline_rag):
        # version_rag: must expose query(), version_inquiry(), change_retrieval()
        # baseline_rag: must expose query()
        # Both are expected to return dicts with 'answer' and 'sources' keys.
        self.version_rag = version_rag
        self.baseline_rag = baseline_rag

    def evaluate(self, dataset: 'VersionQADataset') -> Dict:
        """Run full evaluation on dataset.

        Returns a dict with 'versionrag' and 'baseline' metric blocks, the
        question count, and metric deltas under 'improvement'.
        """
        versionrag_results = []
        baseline_results = []

        for question in dataset.questions:
            # Evaluate VersionRAG, dispatching on the question type.
            start_time = time.time()

            try:
                if question.query_type == "content_retrieval":
                    vrag_answer = self.version_rag.query(
                        query=question.query,
                        version_filter=question.expected_version if question.expected_version != "all" else None
                    )
                elif question.query_type == "version_inquiry":
                    vrag_answer = self.version_rag.version_inquiry(question.query)
                else:  # change_retrieval
                    vrag_answer = self.version_rag.change_retrieval(question.query)

                vrag_latency = time.time() - start_time
            except Exception as e:
                # Keep the evaluation running on per-question failures; the
                # failed question is scored as an empty answer.
                print(f"VersionRAG error on '{question.query}': {e}")
                vrag_answer = {'answer': '', 'sources': []}
                vrag_latency = 0

            # Evaluate Baseline (single retrieval path for all query types).
            start_time = time.time()
            try:
                baseline_answer = self.baseline_rag.query(question.query)
                baseline_latency = time.time() - start_time
            except Exception as e:
                print(f"Baseline error on '{question.query}': {e}")
                baseline_answer = {'answer': '', 'sources': []}
                baseline_latency = 0

            # Score both answers against the reference.
            vrag_score = self._score_answer(
                vrag_answer.get('answer', ''),
                question.expected_answer,
                vrag_answer.get('sources', []),
                question.expected_version,
                question.expected_keywords
            )

            baseline_score = self._score_answer(
                baseline_answer.get('answer', ''),
                question.expected_answer,
                baseline_answer.get('sources', []),
                question.expected_version,
                question.expected_keywords
            )

            versionrag_results.append({
                'question': question,
                'score': vrag_score,
                'latency': vrag_latency,
                'answer': vrag_answer.get('answer', '')
            })

            baseline_results.append({
                'question': question,
                'score': baseline_score,
                'latency': baseline_latency,
                'answer': baseline_answer.get('answer', '')
            })

        # Aggregate per-system metrics.
        versionrag_metrics = self._compute_metrics(versionrag_results)
        baseline_metrics = self._compute_metrics(baseline_results)

        return {
            'versionrag': versionrag_metrics,
            'baseline': baseline_metrics,
            'questions': len(dataset.questions),
            'improvement': {
                'accuracy': versionrag_metrics['accuracy'] - baseline_metrics['accuracy'],
                'vsa': versionrag_metrics['vsa'] - baseline_metrics['vsa'],
                'hit_at_5': versionrag_metrics['hit_at_5'] - baseline_metrics['hit_at_5']
            }
        }

    def _score_answer(self, answer: str, expected: str, sources: List[Dict],
                      expected_version: str, expected_keywords: List[str] = None) -> Dict:
        """Score an answer based on correctness and version awareness.

        Returns content_score, version_score, keyword_score and their
        weighted combination (0.4 / 0.4 / 0.2) as total_score.
        """
        if not answer:
            # Empty answer scores zero on every axis.
            return {
                'content_score': 0.0,
                'version_score': 0.0,
                'keyword_score': 0.0,
                'total_score': 0.0
            }

        # Keyword-based content scoring: expected-answer words plus any
        # explicitly listed keywords form the target set.
        expected_keywords_set = set(expected.lower().split())
        if expected_keywords:
            expected_keywords_set.update([k.lower() for k in expected_keywords])

        answer_keywords = set(answer.lower().split())

        # Fraction of target keywords that appear in the answer, capped at 1.
        overlap = len(expected_keywords_set & answer_keywords)
        keyword_score = min(overlap / max(len(expected_keywords_set), 1), 1.0)

        # Semantic similarity (simple word overlap as proxy).
        answer_words = answer.lower().split()
        expected_words = expected.lower().split()

        common_words = set(answer_words) & set(expected_words)
        if len(expected_words) > 0:
            content_score = len(common_words) / len(expected_words)
        else:
            content_score = 0.0

        # Boost score if answer is substantive and contains key terms.
        if len(answer) > 20 and keyword_score > 0.3:
            content_score = min(content_score * 1.2, 1.0)

        # Check version awareness via the retrieved sources.
        version_score = self._compute_version_score(sources, expected_version)

        # Combined, weighted score.
        total_score = (content_score * 0.4 + version_score * 0.4 + keyword_score * 0.2)

        return {
            'content_score': content_score,
            'version_score': version_score,
            'keyword_score': keyword_score,
            'total_score': total_score
        }

    def _compute_version_score(self, sources: List[Dict], expected_version: str) -> float:
        """Compute version-awareness score from the retrieved sources."""
        if expected_version == "all":
            # For version inquiry, reward answers whose sources span many
            # distinct versions (saturating at 3 versions).
            versions_in_sources = set()
            for source in sources:
                if isinstance(source, dict):
                    version = source.get('version', 'N/A')
                    if version != 'N/A':
                        versions_in_sources.add(version)

            return min(len(versions_in_sources) / 3.0, 1.0)
        else:
            # For a specific version, it is all-or-nothing: 1.0 if any
            # source carries the expected version string.
            for source in sources:
                if isinstance(source, dict):
                    version = source.get('version', '')
                    if expected_version in str(version):
                        return 1.0
            return 0.0

    def _compute_metrics(self, results: List[Dict]) -> Dict:
        """Compute aggregate evaluation metrics over per-question results."""
        if not results:
            return {
                'accuracy': 0.0,
                'hit_at_5': 0.0,
                'mrr': 0.0,
                'vsa': 0.0,
                'avg_latency': 0.0,
                'by_type': {
                    'content_retrieval': 0.0,
                    'version_inquiry': 0.0,
                    'change_retrieval': 0.0
                }
            }

        # Overall metrics
        total_scores = [r['score']['total_score'] for r in results]
        content_scores = [r['score']['content_score'] for r in results]
        version_scores = [r['score']['version_score'] for r in results]
        latencies = [r['latency'] for r in results]

        # Hit@k (consider a hit if total score > 0.5).
        hits = [1 if score > 0.5 else 0 for score in total_scores]

        # MRR (Mean Reciprocal Rank), approximated from the score:
        # rank 1 if score > 0.7, rank 2 if > 0.5, rank 3 if > 0.3, else rank 5.
        reciprocal_ranks = []
        for score in total_scores:
            if score > 0.7:
                reciprocal_ranks.append(1.0)
            elif score > 0.5:
                reciprocal_ranks.append(1/2)
            elif score > 0.3:
                reciprocal_ranks.append(1/3)
            else:
                reciprocal_ranks.append(1/5)

        # Per-query-type score buckets.
        by_type = {
            'content_retrieval': [],
            'version_inquiry': [],
            'change_retrieval': []
        }

        for result in results:
            qtype = result['question'].query_type
            # Robustness fix: the original indexed by_type[qtype] directly and
            # raised KeyError for any unexpected query_type; setdefault keeps
            # the known buckets and tolerates (and still records) unknown ones.
            by_type.setdefault(qtype, []).append(result['score']['total_score'])

        return {
            'accuracy': np.mean(total_scores) * 100,
            'hit_at_5': np.mean(hits) * 100,
            'mrr': np.mean(reciprocal_ranks),
            'vsa': np.mean(version_scores) * 100,  # Version-Sensitive Accuracy
            'avg_latency': np.mean(latencies) if latencies else 0,
            'by_type': {
                'content_retrieval': np.mean(by_type['content_retrieval']) * 100 if by_type['content_retrieval'] else 0,
                'version_inquiry': np.mean(by_type['version_inquiry']) * 100 if by_type['version_inquiry'] else 0,
                'change_retrieval': np.mean(by_type['change_retrieval']) * 100 if by_type['change_retrieval'] else 0
            }
        }
graph_manager.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # graph_manager.py - Version Graph Management
2
+ import networkx as nx
3
+ from typing import List, Dict, Optional, Set
4
+ import json
5
+ from datetime import datetime
6
+ import difflib
7
+
8
class GraphManager:
    """Manages a version graph with documents, versions, and changes.

    Nodes are either 'document', 'version' (keyed "<doc>:<version>") or
    'changes' nodes; edges are 'has_version', 'next_version', 'has_changes'.
    Raw text per (document, version) is kept in version_content.
    """

    def __init__(self, user_id: str):
        self.user_id = user_id
        self.graph = nx.DiGraph()
        self.document_versions = {}  # document_name -> [versions]
        self.version_content = {}    # (document, version) -> content

    def add_document_version(self, document_name: str, version: str,
                             content: str, metadata: Dict = None):
        """Add a new version of a document to the graph.

        Creates the document node on first use, adds the version node,
        stores the raw content, and links the version to its predecessor.
        """
        # Create document node if it doesn't exist
        if document_name not in self.graph:
            self.graph.add_node(document_name, node_type='document',
                                metadata=metadata or {})
            self.document_versions[document_name] = []

        # Create version node
        version_node = f"{document_name}:{version}"
        self.graph.add_node(
            version_node,
            node_type='version',
            version=version,
            document=document_name,
            timestamp=datetime.now().isoformat(),
            metadata=metadata or {}
        )

        # Link document to version
        self.graph.add_edge(document_name, version_node, edge_type='has_version')

        # Store content
        self.version_content[(document_name, version)] = content

        # Add to version list.
        # NOTE(review): plain .sort() orders versions lexicographically, so
        # e.g. "v10.0" sorts before "v2.0" — TODO confirm whether a natural
        # version ordering is intended here.
        if version not in self.document_versions[document_name]:
            self.document_versions[document_name].append(version)
            self.document_versions[document_name].sort()

        # Link to previous version if one exists.
        # Bug fix: the original used versions[versions.index(version) - 1],
        # which for index 0 wraps to versions[-1] and wrongly linked the
        # LAST version as the predecessor of the first-sorting one.
        versions = self.document_versions[document_name]
        idx = versions.index(version)
        if idx > 0:
            prev_node = f"{document_name}:{versions[idx - 1]}"
            self.graph.add_edge(prev_node, version_node, edge_type='next_version')

    def add_version_with_changes(self, document_name: str, version: str,
                                 changes: Dict):
        """Attach an explicit change record to an existing version node."""
        version_node = f"{document_name}:{version}"

        # Create change node
        change_node = f"{version_node}:changes"
        self.graph.add_node(
            change_node,
            node_type='changes',
            additions=changes.get('additions', []),
            deletions=changes.get('deletions', []),
            modifications=changes.get('modifications', []),
            timestamp=datetime.now().isoformat()
        )

        # Link version to changes
        self.graph.add_edge(version_node, change_node, edge_type='has_changes')

    def get_all_documents(self) -> List[str]:
        """Get list of all document nodes in the graph."""
        return [node for node, data in self.graph.nodes(data=True)
                if data.get('node_type') == 'document']

    def get_document_versions(self, document_name: str) -> List[str]:
        """Get all known versions of a document (empty list if unknown)."""
        return self.document_versions.get(document_name, [])

    def get_version_info(self, document_name: str, version: str) -> Dict:
        """Get the node attributes of a specific version ({} if absent)."""
        version_node = f"{document_name}:{version}"
        if version_node in self.graph:
            return self.graph.nodes[version_node]
        return {}

    def get_changes_between_versions(self, document_name: str,
                                     version1: str, version2: str) -> Dict:
        """Compute line-level changes between two stored versions.

        Returns dict with 'additions', 'deletions', 'modifications' lists
        (each truncated to 10 entries for display). Empty lists when either
        version's content is missing.
        """
        content1 = self.version_content.get((document_name, version1), "")
        content2 = self.version_content.get((document_name, version2), "")

        if not content1 or not content2:
            return {'additions': [], 'deletions': [], 'modifications': []}

        # Compute diff
        lines1 = content1.split('\n')
        lines2 = content2.split('\n')

        diff = difflib.unified_diff(lines1, lines2, lineterm='')

        additions = []
        deletions = []
        modifications = []

        for line in diff:
            if line.startswith('+') and not line.startswith('+++'):
                additions.append(line[1:])
            elif line.startswith('-') and not line.startswith('---'):
                deletions.append(line[1:])
            elif line.startswith('?'):
                # NOTE(review): unified_diff never emits '?' lines (only
                # difflib.Differ does), so this branch appears dead — kept
                # for compatibility; confirm before removing.
                modifications.append(line[1:])

        return {
            'additions': additions[:10],  # Limit for display
            'deletions': deletions[:10],
            'modifications': modifications[:10]
        }

    def query_version_graph(self, query: str) -> List[Dict]:
        """Return version nodes whose data matches any query term.

        Simple keyword matching over the stringified node attributes; can be
        enhanced with embeddings later.
        """
        results = []

        for node, data in self.graph.nodes(data=True):
            if data.get('node_type') == 'version':
                if any(term.lower() in str(data).lower() for term in query.split()):
                    results.append({
                        'node': node,
                        'data': data
                    })

        return results

    def export_graph(self) -> Dict:
        """Export the graph structure as plain dicts/lists."""
        return {
            'nodes': dict(self.graph.nodes(data=True)),
            'edges': list(self.graph.edges(data=True)),
            'document_versions': self.document_versions
        }

    def import_graph(self, graph_data: Dict):
        """Rebuild the graph from a structure produced by export_graph."""
        self.graph = nx.DiGraph()

        for node, data in graph_data['nodes'].items():
            self.graph.add_node(node, **data)

        for source, target, data in graph_data['edges']:
            self.graph.add_edge(source, target, **data)

        self.document_versions = graph_data.get('document_versions', {})
+ self.document_versions = graph_data.get('document_versions', {})
utils.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py - Utility Functions
2
+ import PyPDF2
3
+ import io
4
+ import difflib
5
+ from typing import List, Dict
6
+ import hashlib
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+
11
class DocumentProcessor:
    """Document processing utilities: PDF text extraction, hashing, chunking."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract plain text from a PDF given as raw bytes.

        Returns the concatenated text of all pages, one newline appended
        per page. Raises Exception with a descriptive message (chained to
        the original error) if parsing fails.
        """
        try:
            pdf_file = io.BytesIO(pdf_bytes)
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for pages with no
                # extractable text; guard so we never concatenate None.
                text += (page.extract_text() or "") + "\n"

            return text
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the SHA-256 hex digest of *content*."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split *text* into chunks of *chunk_size* chars overlapping by *overlap*.

        Whitespace-only chunks are skipped. Raises ValueError for a
        non-positive chunk_size or when overlap >= chunk_size, which would
        otherwise make the loop never advance (infinite loop).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]
            if chunk.strip():  # Only add non-empty chunks
                chunks.append(chunk)
            start = end - overlap

        return chunks
49
+
50
class ChangeDetector:
    """Detect changes between document versions."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-level diff between two text versions.

        Returns a dict with 'additions', 'deletions' and 'modifications'
        lists, bucketed by difflib.Differ's two-character line markers
        ('+ ', '- ', '? ').
        """
        delta = difflib.Differ().compare(old_text.split('\n'),
                                         new_text.split('\n'))

        buckets = {'+ ': [], '- ': [], '? ': []}
        for entry in delta:
            tag = entry[:2]
            if tag in buckets:
                buckets[tag].append(entry[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                  embeddings) -> List[Dict]:
        """Detect semantic changes between versions using embeddings.

        Currently a placeholder: both versions are chunked and embedded,
        but an empty change list is always returned. Kept for API
        compatibility; embedding errors are printed and swallowed.
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)

        try:
            old_embeddings = embeddings.embed_documents(old_chunks)
            new_embeddings = embeddings.embed_documents(new_chunks)

            # Simplified version - no change records are produced yet;
            # a real implementation would compare the two embedding sets.
            changes = []

            return changes
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []
99
+
100
class PersistentStorage:
    """Persist per-user upload metadata as JSON on disk.

    Metadata lives in ./user_data_<user_id>/uploaded_files.json. Save and
    load errors are printed and treated as best-effort / "no metadata",
    matching the original tolerant behavior.
    """

    def __init__(self, user_id: str):
        # Side effect: ensures the per-user storage directory exists.
        self.user_id = user_id
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Write *metadata* to the JSON file (errors are printed, not raised)."""
        try:
            # Explicit UTF-8 so behavior does not depend on the platform's
            # default locale encoding.
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2)
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Read metadata back; returns {} if the file is missing or unreadable."""
        if not self.metadata_file.exists():
            return {}
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Delete the metadata file if it exists (no-op otherwise)."""
        if self.metadata_file.exists():
            self.metadata_file.unlink()
version_rag.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # version_rag.py - Core VersionRAG Implementation (OpenAI Embeddings)
2
+ import chromadb
3
+ from chromadb.config import Settings
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
6
+ from typing import List, Dict, Optional
7
+ import os
8
+ from datetime import datetime
9
+ import uuid
10
+
11
class VersionRAG:
    """Version-aware RAG system combining a graph index with a vector store.

    Content chunks and change records (additions/deletions/modifications)
    are stored in a single per-user ChromaDB collection, tagged with
    ``tenant_id`` and ``doc_type`` metadata so queries can filter by user,
    document version, and record kind. An optional graph manager supplies
    the document/version structure for version inquiries.
    """

    def __init__(self, user_id: str, model_name: str = "gpt-3.5-turbo",
                 embedding_model: str = "text-embedding-3-small"):
        """Set up OpenAI embeddings, a persistent Chroma collection, and the LLM.

        Side effects: creates ./chroma_db_<user_id> on disk and opens (or
        creates) the collection "versionrag_<user_id>".
        """
        self.user_id = user_id
        self.model_name = model_name

        # Initialize embeddings - Using OpenAI instead of sentence-transformers
        self.embeddings = OpenAIEmbeddings(model=embedding_model)

        # Initialize ChromaDB with persistence (one directory per user/tenant)
        persist_dir = f"./chroma_db_{user_id}"
        os.makedirs(persist_dir, exist_ok=True)

        self.chroma_client = chromadb.PersistentClient(path=persist_dir)

        # Create collection with tenant metadata
        collection_name = f"versionrag_{user_id}"
        try:
            self.collection = self.chroma_client.get_collection(name=collection_name)
        except:  # NOTE(review): bare except — any failure falls through to creation; consider narrowing
            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                metadata={"tenant_id": user_id}
            )

        # Initialize LLM (temperature 0 for deterministic answers)
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0
        )

        # Text splitter applied to every document added to the store
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )

        # In-memory mirrors of chunks/metadata added during this session
        self.documents = []
        self.metadatas = []
        self.graph_manager = None  # attached later via set_graph_manager()

    def set_graph_manager(self, graph_manager):
        """Attach the graph manager used for version tracking lookups."""
        self.graph_manager = graph_manager

    def add_documents(self, texts: List[str], metadatas: List[Dict], changes: Optional[List[Dict]] = None):
        """Add documents to the vector store with version metadata and changes.

        Each text is chunked and stored with doc_type='content'. When a
        parallel ``changes`` list is supplied (dicts with 'additions' /
        'deletions' / 'modifications'), each change line is stored as its
        own doc_type='change' record, capped at 20 per category and
        skipping trivial lines (<= 10 chars after stripping).
        """
        all_chunks = []
        all_chunk_metadatas = []
        all_ids = []

        for idx, (text, metadata) in enumerate(zip(texts, metadatas)):
            # Split text into chunks
            chunks = self.text_splitter.split_text(text)

            # Add tenant_id to metadata
            for chunk_idx, chunk in enumerate(chunks):
                chunk_metadata = metadata.copy()
                chunk_metadata['tenant_id'] = self.user_id
                # chunk_id is the global position across all docs in this call
                chunk_metadata['chunk_id'] = len(all_chunks)
                chunk_metadata['doc_type'] = 'content'

                all_chunks.append(chunk)
                all_chunk_metadatas.append(chunk_metadata)
                all_ids.append(f"{self.user_id}_content_{uuid.uuid4()}")

            # Add change information if provided (changes list parallels texts)
            if changes and idx < len(changes) and changes[idx]:
                change_info = changes[idx]

                # Add additions as separate chunks
                for addition in change_info.get('additions', [])[:20]:
                    if len(addition.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'addition'

                        all_chunks.append(f"[ADDITION in {metadata.get('version')}] {addition}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

                # Add deletions as separate chunks
                for deletion in change_info.get('deletions', [])[:20]:
                    if len(deletion.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'deletion'

                        all_chunks.append(f"[DELETION in {metadata.get('version')}] {deletion}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

                # Add modifications as separate chunks
                for modification in change_info.get('modifications', [])[:20]:
                    if len(modification.strip()) > 10:
                        change_metadata = metadata.copy()
                        change_metadata['tenant_id'] = self.user_id
                        change_metadata['doc_type'] = 'change'
                        change_metadata['change_type'] = 'modification'

                        all_chunks.append(f"[MODIFICATION in {metadata.get('version')}] {modification}")
                        all_chunk_metadatas.append(change_metadata)
                        all_ids.append(f"{self.user_id}_change_{uuid.uuid4()}")

        # Add to ChromaDB (single batched call; embeds every chunk at once)
        if all_chunks:
            embeddings = self.embeddings.embed_documents(all_chunks)

            self.collection.add(
                embeddings=embeddings,
                documents=all_chunks,
                metadatas=all_chunk_metadatas,
                ids=all_ids
            )

            self.documents.extend(all_chunks)
            self.metadatas.extend(all_chunk_metadatas)

    def query(self, query: str, version_filter: Optional[str] = None,
              top_k: int = 5) -> Dict:
        """Answer a question over content chunks, optionally pinned to a version.

        Retrieves top_k content chunks for this tenant (filtered to
        ``version_filter`` when given), builds a version-labelled context,
        and asks the LLM to answer citing versions. Returns a dict with
        'answer', 'sources' (each with similarity = 1 - distance), and
        'context'; errors are reported inside 'answer' rather than raised.
        """
        # Embed query
        query_embedding = self.embeddings.embed_query(query)

        # Build where clause for filtering (Chroma $and of equality filters)
        if version_filter:
            where = {
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"},
                    {"version": version_filter}
                ]
            }
        else:
            where = {
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"}
                ]
            }

        # Query ChromaDB
        try:
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k,
                where=where
            )
        except Exception as e:
            return {
                'answer': f"Error querying database: {str(e)}",
                'sources': []
            }

        # Extract results (Chroma nests per-query lists; [0] is our only query)
        if not results['documents'][0]:
            return {
                'answer': "No relevant documents found.",
                'sources': []
            }

        # Prepare context
        context_docs = results['documents'][0]
        context_metadatas = results['metadatas'][0]
        distances = results['distances'][0]

        # Build context string, labelling each chunk with version and topic
        context = "\n\n".join([
            f"[Version {meta.get('version', 'N/A')} - {meta.get('topic', 'Unknown')}]\n{doc}"
            for doc, meta in zip(context_docs, context_metadatas)
        ])

        # Generate answer using LLM
        prompt = f"""Based on the following context, answer the question.
If the answer includes version-specific information, explicitly mention the version.
Be precise and cite the version when relevant.

Context:
{context}

Question: {query}

Answer:"""

        try:
            response = self.llm.invoke(prompt)
            # ChatOpenAI returns a message object; fall back to str() otherwise
            answer = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            answer = f"Error generating answer: {str(e)}"

        # Prepare sources (similarity assumes distance in [0, 1] — TODO confirm metric)
        sources = []
        for doc, meta, dist in zip(context_docs, context_metadatas, distances):
            sources.append({
                'content': doc,
                'version': meta.get('version', 'N/A'),
                'filename': meta.get('filename', 'N/A'),
                'domain': meta.get('domain', 'N/A'),
                'topic': meta.get('topic', 'N/A'),
                'similarity': 1 - dist
            })

        return {
            'answer': answer,
            'sources': sources,
            'context': context
        }

    def version_inquiry(self, query: str) -> Dict:
        """Answer "what versions exist" style questions.

        Prefers the graph manager: any document whose name shares a word
        with the query is listed with its versions and timestamps. Falls
        back to scanning metadata of the top-20 vector hits when the graph
        yields nothing (or no graph manager is attached).
        """
        if self.graph_manager:
            documents = self.graph_manager.get_all_documents()

            # Keyword overlap between query words and document names
            relevant_docs = []
            query_lower = query.lower()
            for doc in documents:
                if any(word in doc.lower() for word in query_lower.split()):
                    relevant_docs.append(doc)

            if relevant_docs:
                answer = f"Found version information for {len(relevant_docs)} document(s):\n\n"
                versions_found = []

                for doc in relevant_docs:
                    versions = self.graph_manager.get_document_versions(doc)
                    versions_found.extend(versions)
                    answer += f"**{doc}**\n"
                    answer += f"- Versions: {', '.join(versions)}\n"

                    for version in versions:
                        info = self.graph_manager.get_version_info(doc, version)
                        if info:
                            answer += f"  - {version}: {info.get('timestamp', 'N/A')}\n"
                    answer += "\n"

                return {
                    'answer': answer,
                    'sources': [],
                    'versions': list(set(versions_found))
                }

        # Fallback to vector search: collect versions seen in top-hit metadata
        query_embedding = self.embeddings.embed_query(query)

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=20,
            where={
                "$and": [
                    {"tenant_id": self.user_id},
                    {"doc_type": "content"}
                ]
            }
        )

        versions = set()
        version_info = {}

        for meta in results['metadatas'][0]:
            version = meta.get('version', 'N/A')
            if version != 'N/A':
                versions.add(version)
                # Keep the first metadata record seen for each version
                if version not in version_info:
                    version_info[version] = {
                        'filename': meta.get('filename', 'N/A'),
                        'domain': meta.get('domain', 'N/A'),
                        'topic': meta.get('topic', 'N/A')
                    }

        version_list = ", ".join(sorted(versions))
        answer = f"Found {len(versions)} version(s): {version_list}\n\n"

        for version in sorted(versions):
            info = version_info[version]
            answer += f"- **{version}**: {info['topic']} ({info['domain']})\n"

        return {
            'answer': answer,
            'sources': [],
            'versions': list(versions)
        }

    def change_retrieval(self, query: str) -> Dict:
        """Retrieve change information between versions.

        First searches doc_type='change' records; if none match, falls
        back to querying all tenant records and asking the LLM to infer
        changes from general context.
        """
        query_embedding = self.embeddings.embed_query(query)

        try:
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=10,
                where={
                    "$and": [
                        {"tenant_id": self.user_id},
                        {"doc_type": "change"}
                    ]
                }
            )
        except:  # NOTE(review): bare except — retries without the doc_type filter on any failure
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=10,
                where={"tenant_id": self.user_id}
            )

        if results['documents'][0] and results['metadatas'][0]:
            # Keep only genuine change records (fallback query may return content)
            changes = []
            for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
                if meta.get('doc_type') == 'change':
                    changes.append({
                        'content': doc,
                        'version': meta.get('version', 'N/A'),
                        'change_type': meta.get('change_type', 'unknown'),
                        'filename': meta.get('filename', 'N/A'),
                        'topic': meta.get('topic', 'N/A')
                    })

            if changes:
                answer = "Changes detected:\n\n"
                for change in changes[:5]:  # cap the displayed changes
                    answer += f"**[{change['version']} - {change['change_type'].upper()}]**\n"
                    answer += f"Topic: {change['topic']}\n"
                    answer += f"{change['content']}\n\n"

                return {
                    'answer': answer,
                    'sources': changes
                }

        # No explicit change records matched: ask the LLM over general context
        context_results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=5,
            where={"tenant_id": self.user_id}
        )

        if context_results['documents'][0]:
            context = "\n\n".join(context_results['documents'][0])
            prompt = f"""Based on the context, identify and describe any changes, additions, deletions, or modifications mentioned.

Context:
{context}

Question: {query}

Answer:"""

            try:
                response = self.llm.invoke(prompt)
                answer = response.content if hasattr(response, 'content') else str(response)
            except:  # NOTE(review): bare except hides the LLM error; consider logging it
                answer = "Unable to determine changes."
        else:
            answer = "No change information found."

        return {
            'answer': answer,
            'sources': context_results['metadatas'][0][:5] if context_results['metadatas'][0] else []
        }
373
+
374
+
375
class BaselineRAG:
    """Standard RAG system without version awareness.

    Plain chunk-embed-retrieve-answer pipeline over a per-user Chroma
    collection; used as the comparison baseline for VersionRAG.
    """

    def __init__(self, user_id: str, model_name: str = "gpt-3.5-turbo",
                 embedding_model: str = "text-embedding-3-small"):
        """Open (or create) the per-user baseline collection and the LLM."""
        self.user_id = user_id
        self.model_name = model_name

        # Initialize embeddings - Using OpenAI
        self.embeddings = OpenAIEmbeddings(model=embedding_model)

        persist_dir = f"./chroma_baseline_{user_id}"
        os.makedirs(persist_dir, exist_ok=True)

        self.chroma_client = chromadb.PersistentClient(path=persist_dir)

        collection_name = f"baseline_{user_id}"
        try:
            self.collection = self.chroma_client.get_collection(name=collection_name)
        except:
            self.collection = self.chroma_client.create_collection(name=collection_name)

        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0
        )

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def add_documents(self, texts: List[str], metadatas: List[Dict]):
        """Chunk each document and store every chunk with a copy of its metadata."""
        chunk_texts = []
        chunk_metas = []
        chunk_ids = []

        for doc_text, doc_meta in zip(texts, metadatas):
            for piece in self.text_splitter.split_text(doc_text):
                chunk_texts.append(piece)
                chunk_metas.append(doc_meta.copy())
                chunk_ids.append(f"baseline_{self.user_id}_{uuid.uuid4()}")

        if not chunk_texts:
            return

        vectors = self.embeddings.embed_documents(chunk_texts)

        self.collection.add(
            embeddings=vectors,
            documents=chunk_texts,
            metadatas=chunk_metas,
            ids=chunk_ids
        )

    def query(self, query: str, top_k: int = 5) -> Dict:
        """Answer a question from the top_k retrieved chunks (no version logic).

        Returns {'answer', 'sources'}; retrieval or LLM errors are folded
        into the 'answer' string instead of being raised.
        """
        query_embedding = self.embeddings.embed_query(query)

        try:
            hits = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k
            )
        except Exception as e:
            return {
                'answer': f"Error: {str(e)}",
                'sources': []
            }

        docs = hits['documents'][0]
        if not docs:
            return {
                'answer': "No relevant documents found.",
                'sources': []
            }

        context = "\n\n".join(docs)

        prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""

        try:
            response = self.llm.invoke(prompt)
            answer = response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            answer = f"Error: {str(e)}"

        sources = [
            {'content': doc, 'metadata': meta}
            for doc, meta in zip(docs, hits['metadatas'][0])
        ]

        return {
            'answer': answer,
            'sources': sources
        }