Spaces:

Divs0910
/

Digi-Biz

Paused

File size: 50,527 Bytes

255cbd1

"""
Digi-Biz: Agentic Business Digitization Framework
Streamlit Demo Application

This app demonstrates the complete workflow:
1. Upload ZIP with business documents
2. File Discovery Agent extracts and classifies files
3. Document Parsing Agent extracts text and tables
4. Table Extraction Agent detects and classifies tables
5. Media Extraction Agent extracts images
6. Vision Agent (Groq Llama-4-Scout) analyzes images
7. Indexing Agent builds the search index
8. View results and optionally generate a business profile
"""
import streamlit as st
import os
import tempfile
import shutil
from pathlib import Path
from datetime import datetime
import json
import io
from PIL import Image
from backend.utils.logger import get_logger

logger = get_logger(__name__)

# Load environment variables from .env file
from dotenv import load_dotenv
env_path = Path(__file__).parent / ".env"
if env_path.exists():
    load_dotenv(env_path)

# Import Groq to verify it's available
try:
    from groq import Groq
    GROQ_AVAILABLE = True
except ImportError:
    GROQ_AVAILABLE = False

# Import agents
from backend.agents.file_discovery import FileDiscoveryAgent, FileDiscoveryInput
from backend.agents.document_parsing import DocumentParsingAgent, DocumentParsingInput
from backend.agents.table_extraction import TableExtractionAgent, TableExtractionInput
from backend.agents.media_extraction import MediaExtractionAgent, MediaExtractionInput
from backend.agents.vision_agent import VisionAgent, VisionAnalysisInput
from backend.agents.indexing import IndexingAgent, IndexingInput
from backend.utils.storage_manager import StorageManager


# =============================================================================
# Streamlit Configuration
# =============================================================================
# Browser-tab title/icon and overall layout of the app.
_PAGE_SETTINGS = {
    "page_title": "Digi-Biz - Business Digitization",
    "page_icon": "📄",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet backing the CSS classes used by the HTML snippets in this script
# (main-header, sub-header, success-box, info-box, agent-card).
_CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1E88E5;
        text-align: center;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.2rem;
        color: #666;
        text-align: center;
        margin-bottom: 2rem;
    }
    .success-box {
        padding: 1rem;
        border-radius: 0.5rem;
        background-color: #E8F5E9;
        border-left: 4px solid #4CAF50;
        margin: 1rem 0;
    }
    .info-box {
        padding: 1rem;
        border-radius: 0.5rem;
        background-color: #E3F2FD;
        border-left: 4px solid #2196F3;
        margin: 1rem 0;
    }
    .agent-card {
        padding: 1rem;
        border-radius: 0.5rem;
        background-color: #f5f5f5;
        margin: 0.5rem 0;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)


# =============================================================================
# Session State Initialization
# =============================================================================
# Seed each pipeline-related session key once; a value already stored by an
# earlier run of the script is left untouched.
_SESSION_DEFAULTS = {
    'job_id': "",
    'discovery_output': None,
    'parsing_output': None,
    'tables_output': None,
    'media_output': None,
    'vision_output': None,
    'processing_complete': False,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value


# =============================================================================
# Helper Functions
# =============================================================================
def generate_job_id():
    """Generate a unique job ID.

    Returns:
        str: ``job_<YYYYmmdd>_<HHMMSS>_<8 hex chars>``. The timestamp keeps the
        ID human-readable; the random suffix keeps two IDs created within the
        same second distinct, so they never share a working directory.
    """
    import uuid  # local import: only this helper needs it

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    return f"job_{timestamp}_{uuid.uuid4().hex[:8]}"


def cleanup_temp_dirs():
    """Delete the app's ``digi_biz`` folder from the system temp directory, if present."""
    digi_biz_root = Path(tempfile.gettempdir()).joinpath("digi_biz")
    if not digi_biz_root.exists():
        return
    shutil.rmtree(digi_biz_root)


def get_model_status():
    """Check if Ollama and a Qwen model are available.

    Returns:
        tuple: ``(ollama_ok, qwen_available, vision_working, models)`` where
        ``models`` is the list of model names reported by the Ollama server at
        ``http://localhost:11434``. Any failure to import or reach Ollama
        yields ``(False, False, False, [])``.
    """
    def _model_name(entry):
        """Return the identifier of one listed model (dict- or object-style)."""
        # Entries may expose the identifier as `name` or as `model`
        # depending on the ollama client version; accept either.
        for field in ('name', 'model'):
            if isinstance(entry, dict):
                value = entry.get(field)
            else:
                value = getattr(entry, field, None)
            if value:
                return value
        return None

    try:
        from ollama import Client
        client = Client(host='http://localhost:11434', timeout=5)
        response = client.list()

        if isinstance(response, dict) and 'models' in response:
            entries = response['models']
        elif hasattr(response, 'models'):
            entries = response.models
        else:
            entries = []
        models = [name for name in map(_model_name, entries) if name]

        qwen_models = [m for m in models if 'qwen3.5' in m]
        qwen_available = bool(qwen_models)

        # Test actual vision capability
        vision_working = False
        if qwen_available:
            # Probe a tag that is actually installed; the 0.8b tag is
            # preferred when present, otherwise the first qwen3.5 match is used.
            default_tag = 'qwen3.5:0.8b'
            vision_model = default_tag if default_tag in qwen_models else qwen_models[0]
            try:
                # Quick vision test: send a tiny solid-red PNG and ask for its color
                test_client = Client(host='http://localhost:11434', timeout=30)
                test_img = Image.new('RGB', (50, 50), color='red')
                test_bytes = io.BytesIO()
                test_img.save(test_bytes, format='PNG')

                test_response = test_client.chat(
                    model=vision_model,
                    messages=[{
                        'role': 'user',
                        'content': 'What color?',
                        'images': [test_bytes.getvalue()]
                    }],
                    options={'timeout': 20000}
                )

                # NOTE(review): a reply of 10 characters or fewer (e.g. "Red.")
                # counts as "not working" — confirm this threshold is intended.
                vision_working = len(test_response['message']['content'].strip()) > 10
            except Exception:
                vision_working = False

        return True, qwen_available, vision_working, models

    except Exception:
        # Best-effort probe: a missing package or unreachable server means "unavailable"
        return False, False, False, []


# =============================================================================
# Main App
# =============================================================================

# Header
# Title and tagline, styled by the .main-header / .sub-header CSS classes.
for _header_html in (
    '<h1 class="main-header">📄 Digi-Biz</h1>',
    '<p class="sub-header">Agentic Business Digitization Framework</p>',
):
    st.markdown(_header_html, unsafe_allow_html=True)

# Sidebar
with st.sidebar:
    st.header("🔧 Configuration")

    # Model status
    st.subheader("Model Status")

    # Check Groq API
    groq_ok = False
    groq_model = "N/A"
    groq_error = ""

    api_key = os.getenv("GROQ_API_KEY")

    if not GROQ_AVAILABLE:
        # The guarded import at the top of the file failed, so `Groq` is
        # unbound; report that directly instead of surfacing a NameError.
        groq_error = "groq package not installed"
    elif not api_key:
        groq_error = "GROQ_API_KEY not set in .env"
    elif api_key == "gsk_YOUR_API_KEY_HERE":
        groq_error = "Using placeholder key"
    else:
        try:
            # A successful models.list() call is treated as proof that the
            # API is reachable with this key.
            Groq(api_key=api_key, timeout=5).models.list()
            groq_ok = True
            groq_model = "llama-4-scout-17b"
        except Exception as e:
            groq_error = str(e)[:100]

    if groq_ok:
        st.success(f"✓ Groq API: {groq_model}")
    else:
        st.error("✗ Groq API Not Available")
        st.code(groq_error, language=None)
        st.info("Fix: Get key from https://console.groq.com and add to .env file")

    # Check Ollama (fallback)
    ollama_ok = False
    try:
        from ollama import Client
        client = Client(host='http://localhost:11434', timeout=5)
        client.list()
        ollama_ok = True
    except Exception:
        # Ollama is optional: any failure just means the fallback is unavailable
        pass

    if ollama_ok:
        st.success("✓ Ollama: Fallback Ready")
    else:
        st.warning("⚠ Ollama: Not Running (optional)")

    st.divider()

    # Agent status
    st.subheader("Agents")
    st.markdown("""
    <div class="agent-card">
    <b>1. File Discovery</b><br>
    <small>Extracts & classifies files from ZIP</small>
    </div>
    
    <div class="agent-card">
    <b>2. Document Parsing</b><br>
    <small>Extracts text from PDF/DOCX</small>
    </div>
    
    <div class="agent-card">
    <b>3. Table Extraction</b><br>
    <small>Detects & classifies tables</small>
    </div>
    
    <div class="agent-card">
    <b>4. Media Extraction</b><br>
    <small>Extracts embedded images</small>
    </div>
    
    <div class="agent-card">
    <b>5. Vision Agent</b><br>
    <small>Analyzes images with Groq</small>
    </div>
    
    <div class="agent-card">
    <b>6. Indexing Agent</b><br>
    <small>Builds RAG search index</small>
    </div>
    """, unsafe_allow_html=True)

    st.divider()

    # Reset button
    if st.button("🔄 Reset All", use_container_width=True):
        cleanup_temp_dirs()
        # Remove the keys instead of assigning None, so the session-state
        # initialization at the top of the script restores the real defaults
        # (e.g. job_id == "") on the rerun.
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.session_state.processing_complete = False
        st.rerun()

# Main content area
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["📤 Upload", "⚙️ Processing", "📊 Results", "🖼️ Vision Analysis", "🌳 Index Tree", "📄 Business Profile"])

with tab1:
    st.header("Upload Business Documents")
    
    st.markdown("""
    **Supported Formats:**
    - 📄 Documents: PDF, DOCX, DOC
    - 📊 Spreadsheets: XLSX, XLS, CSV
    - 🖼️ Images: JPG, PNG, GIF, WEBP
    - 🎥 Videos: MP4, AVI, MOV
    
    **Instructions:**
    1. Create a ZIP file with your business documents
    2. Upload using the file uploader below
    3. Click "Start Processing"
    """)

    uploaded_file = st.file_uploader(
        "Upload ZIP file",
        type=['zip'],
        help="Select a ZIP file containing business documents"
    )

    if uploaded_file:
        st.success(f"✓ Uploaded: {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")

        # Streamlit re-executes this script on every interaction, so the upload
        # is written to disk only when it is new (or its saved copy is gone).
        # Otherwise each rerun would create another job directory and change
        # the job ID underneath the rest of the app.
        upload_signature = (
            getattr(uploaded_file, 'file_id', None),
            uploaded_file.name,
            uploaded_file.size,
        )
        saved_zip = st.session_state.get('zip_path')
        already_saved = (
            st.session_state.get('upload_signature') == upload_signature
            and bool(saved_zip)
            and Path(saved_zip).exists()
        )

        if not already_saved:
            # Save to temp location
            temp_dir = Path(tempfile.gettempdir()) / "digi_biz" / generate_job_id()
            temp_dir.mkdir(parents=True, exist_ok=True)

            # Keep only the final path component so a crafted file name
            # cannot place the ZIP outside temp_dir.
            safe_name = Path(uploaded_file.name).name or "upload.zip"
            zip_path = temp_dir / safe_name
            with open(zip_path, 'wb') as f:
                f.write(uploaded_file.getvalue())

            st.session_state.zip_path = str(zip_path)
            st.session_state.job_id = temp_dir.name
            st.session_state.upload_signature = upload_signature

        st.info(f"Job ID: `{st.session_state.job_id}`")

        # Start processing button
        if st.button("🚀 Start Processing", type="primary", use_container_width=True):
            st.session_state.processing_started = True
            st.rerun()

with tab2:
    st.header("Processing Pipeline")

    if st.session_state.get('processing_started'):
        # Streamlit re-executes this script on every widget interaction.
        # Consume the flag so the pipeline (including the remote vision calls)
        # runs once per click of "Start Processing" rather than on every rerun.
        st.session_state.processing_started = False
        # A new run invalidates the completion state of any previous run.
        st.session_state.processing_complete = False

        progress_bar = st.progress(0)
        status_text = st.empty()

        # Step 1: File Discovery (mandatory: the run stops if it fails)
        status_text.text("Step 1/6: File Discovery Agent...")
        try:
            storage_manager = StorageManager(storage_base=str(Path(tempfile.gettempdir()) / "digi_biz" / st.session_state.job_id))

            discovery_agent = FileDiscoveryAgent(storage_manager=storage_manager)
            discovery_input = FileDiscoveryInput(
                zip_file_path=st.session_state.zip_path,
                job_id=st.session_state.job_id
            )
            st.session_state.discovery_output = discovery_agent.discover(discovery_input)

            progress_bar.progress(20)

            if st.session_state.discovery_output.success:
                st.success(f"✓ File Discovery Complete: {st.session_state.discovery_output.total_files} files")
                st.markdown(f"""
                <div class="success-box">
                <b>Summary:</b><br>
                • Documents: {st.session_state.discovery_output.summary.get('documents_count', 0)}<br>
                • Spreadsheets: {st.session_state.discovery_output.summary.get('spreadsheets_count', 0)}<br>
                • Images: {st.session_state.discovery_output.summary.get('images_count', 0)}<br>
                • Videos: {st.session_state.discovery_output.summary.get('videos_count', 0)}
                </div>
                """, unsafe_allow_html=True)
            else:
                st.error(f"✗ File Discovery Failed: {st.session_state.discovery_output.errors}")
                st.stop()

        except Exception as e:
            st.error(f"File Discovery Error: {str(e)}")
            st.stop()

        # Step 2: Document Parsing (failures are reported as warnings; the run continues)
        status_text.text("Step 2/6: Document Parsing Agent...")
        try:
            parsing_agent = DocumentParsingAgent(enable_ocr=False)
            parsing_input = DocumentParsingInput(
                documents=st.session_state.discovery_output.documents,
                job_id=st.session_state.job_id,
                enable_ocr=False
            )
            st.session_state.parsing_output = parsing_agent.parse(parsing_input)

            progress_bar.progress(40)

            if st.session_state.parsing_output.success:
                st.success(f"✓ Document Parsing Complete: {st.session_state.parsing_output.total_pages} pages")
            else:
                st.warning("⚠ Document Parsing: No documents to parse")

        except Exception as e:
            st.warning(f"Document Parsing: {str(e)}")

        # Step 3: Table Extraction
        status_text.text("Step 3/6: Table Extraction Agent...")
        try:
            table_agent = TableExtractionAgent()
            table_input = TableExtractionInput(
                parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
                job_id=st.session_state.job_id
            )
            st.session_state.tables_output = table_agent.extract(table_input)

            progress_bar.progress(60)

            if st.session_state.tables_output.success:
                st.success(f"✓ Table Extraction Complete: {st.session_state.tables_output.total_tables} tables")
                if st.session_state.tables_output.tables_by_type:
                    types_str = ", ".join([f"{k}: {v}" for k, v in st.session_state.tables_output.tables_by_type.items()])
                    st.info(f"Types: {types_str}")
            else:
                st.warning("⚠ Table Extraction: No tables found")

        except Exception as e:
            st.warning(f"Table Extraction: {str(e)}")

        # Step 4: Media Extraction
        status_text.text("Step 4/6: Media Extraction Agent...")
        try:
            media_agent = MediaExtractionAgent(enable_deduplication=True)
            media_input = MediaExtractionInput(
                parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
                standalone_files=[img.file_path for img in st.session_state.discovery_output.images] if st.session_state.discovery_output else [],
                job_id=st.session_state.job_id
            )
            st.session_state.media_output = media_agent.extract_all(media_input)

            progress_bar.progress(80)

            if st.session_state.media_output.success:
                st.success(f"✓ Media Extraction Complete: {st.session_state.media_output.total_images} images")
                if st.session_state.media_output.duplicates_removed > 0:
                    st.info(f"Removed {st.session_state.media_output.duplicates_removed} duplicates")
            else:
                st.warning("⚠ Media Extraction: No images found")

        except Exception as e:
            st.warning(f"Media Extraction: {str(e)}")

        # Step 5: Vision Analysis
        status_text.text("Step 5/6: Vision Agent (Groq Llama-4-Scout)...")
        try:
            # Initialize Vision Agent with Groq provider
            vision_agent = VisionAgent(provider="groq", timeout=120)

            # Check if we have images to analyze
            images_to_analyze = []
            if st.session_state.media_output and st.session_state.media_output.success:
                images_to_analyze = st.session_state.media_output.media.images[:5]  # Analyze first 5 images

            if images_to_analyze:
                st.info(f"Analyzing {len(images_to_analyze)} images with Groq Vision (Llama-4-Scout)...")
                progress_vision = st.progress(0)

                try:
                    # Analyze images
                    analyses = vision_agent.analyze_batch(images_to_analyze)
                    st.session_state.vision_output = analyses

                    progress_vision.progress(100)
                    st.success(f"✓ Vision Analysis Complete: {len(analyses)} images analyzed")

                    # Show quick summary
                    if analyses:
                        categories = {}
                        for a in analyses:
                            # The category may be an enum or a plain string
                            # (the Vision Analysis tab handles both); reading
                            # `.value` unconditionally would raise here and
                            # wrongly trigger the Ollama fallback below.
                            cat = getattr(a.category, 'value', a.category)
                            categories[cat] = categories.get(cat, 0) + 1

                        st.markdown("**Categories Detected:**")
                        cat_text = ", ".join([f"{k}: {v}" for k, v in categories.items()])
                        st.info(cat_text)

                except Exception as ve:
                    st.warning(f"Vision analysis failed: {str(ve)}")
                    st.info("Falling back to Ollama...")

                    # Try Ollama fallback
                    try:
                        vision_agent_ollama = VisionAgent(provider="ollama", timeout=120)
                        analyses = vision_agent_ollama.analyze_batch(images_to_analyze)
                        st.session_state.vision_output = analyses
                        st.success(f"✓ Vision Analysis Complete (via Ollama): {len(analyses)} images")
                    except Exception as e2:
                        st.session_state.vision_output = None
                        st.error(f"All vision providers failed: {e2}")
            else:
                st.session_state.vision_output = None
                st.warning("⚠ Vision Analysis: No images to analyze")

            # Step 6: Indexing (RAG)
            status_text.text("Step 6/6: Building Search Index (RAG)...")
            try:
                indexing_agent = IndexingAgent()

                # Prepare indexing input
                all_images = []
                if st.session_state.media_output and st.session_state.media_output.success:
                    all_images = st.session_state.media_output.media.images

                indexing_input = IndexingInput(
                    parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
                    tables=st.session_state.tables_output.tables if st.session_state.tables_output else [],
                    images=all_images,
                    job_id=st.session_state.job_id
                )

                # Build index
                page_index = indexing_agent.build_index(indexing_input)

                # Store in session state (convert Pydantic model to dict for serialization)
                st.session_state.page_index_dict = page_index.model_dump(mode='json')
                st.session_state.page_index_has_data = True

                st.success(f"✓ Index Built: {page_index.metadata.get('total_keywords', 0)} keywords")

            except Exception as e:
                st.warning(f"Indexing failed: {str(e)}")
                st.session_state.page_index_dict = None
                st.session_state.page_index_has_data = False

            progress_bar.progress(100)
            status_text.text("✓ Processing Complete!")

            st.session_state.processing_complete = True

        except Exception as e:
            st.warning(f"Processing error: {str(e)}")
            st.session_state.processing_complete = False

    elif st.session_state.get('processing_complete'):
        # A previous run finished; its outputs live in session state.
        st.success("✓ Processing Complete!")
        st.info("Results are shown in the other tabs. Click 'Start Processing' on the Upload tab to run the pipeline again.")
    else:
        st.info("👆 Upload a ZIP file and click 'Start Processing'")

# Step 7: Schema Mapping is optional and runs on demand from the
# "Generate Business Profile with AI" button in the Results tab below.

with tab3:
    st.header("Processing Results")

    if not st.session_state.processing_complete:
        # Deliberately no st.stop() here: it halts the whole script, which
        # would leave every tab defined after this one empty.
        st.info("⏳ Processing not complete yet. Go to 'Processing' tab.")
    else:
        # Generate Business Profile Button
        st.subheader("🎯 Generate Business Profile")
        st.markdown("Use AI to create a structured business profile from extracted data")

        if st.button("🚀 Generate Business Profile with AI", type="primary", use_container_width=True):
            with st.spinner("Generating business profile with Groq AI... Processing each document individually (1-2 minutes)"):
                try:
                    from backend.agents.schema_mapping_simple import SchemaMappingAgent
                    from backend.models.schemas import SchemaMappingInput
                    from backend.agents.validation_agent import ValidationAgent
                    from backend.models.schemas import ValidationInput as ValidationInputSchema

                    # Get page index
                    if not st.session_state.get('page_index_dict'):
                        st.error("No index available. Please run processing first.")
                    else:
                        from backend.models.schemas import PageIndex
                        page_index = PageIndex.model_validate(st.session_state.page_index_dict)

                        # Step 1: Schema Mapping
                        with st.status("Running Schema Mapping Agent...", expanded=True) as status:
                            agent = SchemaMappingAgent()
                            input_data = SchemaMappingInput(
                                page_index=page_index,
                                job_id=st.session_state.job_id
                            )
                            mapping_output = agent.map_to_schema(input_data)

                            if mapping_output.success and mapping_output.profile:
                                st.success("✅ Schema mapping complete!")
                                status.update(label="Schema Mapping Complete", state="complete")
                            else:
                                st.warning(f"⚠️ Schema mapping had issues: {mapping_output.errors}")
                                status.update(label="Schema Mapping Complete (with warnings)", state="complete")

                        if mapping_output.profile:
                            # Step 2: Validation
                            with st.status("Running Validation Agent...", expanded=True) as status:
                                validation_agent = ValidationAgent()
                                validation_input = ValidationInputSchema(
                                    profile=mapping_output.profile,
                                    job_id=st.session_state.job_id
                                )
                                validation_output = validation_agent.validate(validation_input)

                                st.session_state.validation_result = validation_output.model_dump(mode='json')

                                if validation_output.is_valid:
                                    st.success(f"✅ Validation passed! Completeness: {validation_output.completeness_score:.0%}")
                                    status.update(label="Validation Complete", state="complete")
                                else:
                                    st.warning(f"⚠️ Validation found {len(validation_output.errors)} errors")
                                    status.update(label="Validation Complete (errors found)", state="complete")

                            # Store profile
                            st.session_state.business_profile = mapping_output.profile.model_dump(mode='json')
                            st.success("✅ Business Profile Generated Successfully!")
                            st.info("Go to 'Business Profile' tab to view results")
                        else:
                            # No profile means nothing to validate. Skipping the
                            # validation step keeps this specific message from
                            # being masked by whatever validating an empty
                            # profile would raise (presumably a schema error —
                            # TODO confirm against ValidationInput).
                            st.error("Failed to generate profile")

                except Exception as e:
                    st.error(f"Error generating profile: {str(e)}")
                    logger.error(f"Schema mapping failed: {e}")

        st.divider()

        # File Discovery Results
        st.subheader("📁 File Discovery")
        if st.session_state.discovery_output:
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Documents", st.session_state.discovery_output.summary.get('documents_count', 0))
            with col2:
                st.metric("Spreadsheets", st.session_state.discovery_output.summary.get('spreadsheets_count', 0))
            with col3:
                st.metric("Images", st.session_state.discovery_output.summary.get('images_count', 0))
            with col4:
                st.metric("Videos", st.session_state.discovery_output.summary.get('videos_count', 0))

            # File list (documents only)
            with st.expander("📋 View File List"):
                if st.session_state.discovery_output.documents:
                    st.write("**Documents:**")
                    for doc in st.session_state.discovery_output.documents:
                        st.write(f"- {doc.original_name} ({doc.file_type.value})")

        # Document Parsing Results
        st.subheader("📄 Document Parsing")
        if st.session_state.parsing_output and st.session_state.parsing_output.success:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Pages", st.session_state.parsing_output.total_pages)
            with col2:
                st.metric("Processing Time", f"{st.session_state.parsing_output.processing_time:.1f}s")

            # Show extracted text from first document (first page, first 1000 characters)
            with st.expander("📝 View Extracted Text"):
                if st.session_state.parsing_output.parsed_documents:
                    doc = st.session_state.parsing_output.parsed_documents[0]
                    st.write(f"**Source:** {doc.source_file}")
                    st.write(f"**Pages:** {doc.total_pages}")
                    if doc.pages and doc.pages[0].text:
                        st.text_area("Text content", doc.pages[0].text[:1000], height=300)

        # Table Extraction Results
        st.subheader("📊 Table Extraction")
        if st.session_state.tables_output and st.session_state.tables_output.success:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Tables Found", st.session_state.tables_output.total_tables)
            with col2:
                st.metric("By Type", str(st.session_state.tables_output.tables_by_type))

            # Show tables
            with st.expander("📋 View Tables"):
                for i, table in enumerate(st.session_state.tables_output.tables):
                    st.write(f"**Table {i+1}:** {table.table_type.value}")
                    st.write(f"Source: {table.source_doc}, Page: {table.source_page}")
                    if table.headers:
                        st.write(f"Headers: {', '.join(table.headers)}")

with tab4:
    st.header("🖼️ Vision Analysis (Groq Llama-4-Scout)")

    # Branch instead of calling st.stop(): st.stop() halts the whole script,
    # so the tabs defined after this one would stay empty whenever there is
    # nothing to show here.
    if not st.session_state.processing_complete:
        st.info("⏳ Processing not complete yet.")
    elif not st.session_state.vision_output:
        st.warning("⚠ No vision analysis available. Either no images were found or analysis failed.")
    else:
        # Marker shown next to each category name (same for every image)
        category_colors = {
            'product': '🔵',
            'service': '🟢',
            'food': '🟠',
            'destination': '🟣',
            'person': '🔴',
            'document': '⚪',
            'logo': '🟡',
            'other': '⚫'
        }

        # Show analyzed images
        for i, analysis in enumerate(st.session_state.vision_output):
            st.divider()

            col1, col2 = st.columns([1, 2])

            with col1:
                # Find corresponding image
                if st.session_state.media_output:
                    for img in st.session_state.media_output.media.images:
                        if img.image_id == analysis.image_id:
                            try:
                                st.image(img.file_path, caption=analysis.image_id, use_container_width=True)
                            except Exception:
                                # Image could not be displayed; show its ID instead
                                st.write(f"Image: {analysis.image_id}")
                            break

            with col2:
                st.subheader(f"Analysis {i+1}")

                # Category badge - handle both str and enum
                category_value = analysis.category
                if hasattr(analysis.category, 'value'):
                    category_value = analysis.category.value
                elif isinstance(analysis.category, str):
                    category_value = analysis.category.lower()

                category_emoji = category_colors.get(category_value, '⚪')
                st.markdown(f"**Category:** {category_emoji} {category_value}")

                # Show provider and confidence
                provider = analysis.metadata.get('provider', 'unknown')
                provider_icon = "🚀" if provider == 'groq' else "🦙"
                st.markdown(f"**Provider:** {provider_icon} {provider.upper()}")
                st.markdown(f"**Confidence:** {analysis.confidence:.0%}")

                # Description
                if analysis.description:
                    st.markdown(f"**Description:** {analysis.description}")

                # Tags
                if analysis.tags:
                    st.markdown(f"**Tags:** {', '.join(analysis.tags)}")

                # Product/Service flags
                col_a, col_b = st.columns(2)
                with col_a:
                    if analysis.is_product:
                        st.success("✓ Product")
                with col_b:
                    if analysis.is_service_related:
                        st.info("✓ Service-related")

                # Associations
                if analysis.suggested_associations:
                    st.markdown(f"**Associations:** {', '.join(analysis.suggested_associations)}")

                # Processing time
                proc_time = analysis.metadata.get('processing_time', 0)
                st.caption(f"Processed in {proc_time:.2f}s")

with tab5:
    st.header("🌳 PageIndex Tree Structure")
    
    if not st.session_state.processing_complete:
        st.info("⏳ Processing not complete yet.")
        st.stop()
    
    if not st.session_state.get('page_index_has_data') or not st.session_state.get('page_index_dict'):
        st.warning("⚠ No index available. Run processing first.")
        st.stop()
    
    # Reconstruct PageIndex from dict
    from backend.models.schemas import PageIndex
    page_index = PageIndex.model_validate(st.session_state.page_index_dict)
    
    # Index Statistics
    st.subheader("📊 Index Statistics")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Keywords", page_index.metadata.get('total_keywords', 0))
    with col2:
        # Count tree nodes from documents if tree_root is None
        tree_node_count = 0
        if page_index.tree_root:
            tree_node_count = page_index.metadata.get('total_tree_nodes', 0)
        elif page_index.documents:
            tree_node_count = len(page_index.documents)
        st.metric("Tree Nodes", tree_node_count)
    with col3:
        st.metric("Build Time", f"{page_index.metadata.get('build_time_seconds', 0):.2f}s")
    
    st.divider()

    # Tree Visualization - fall back to a flat document list when no usable tree exists
    st.subheader("🌲 Document Tree")

    if page_index.tree_root and page_index.tree_root.children:
        def display_tree_node(node, level=0):
            """Render ``node`` and all of its descendants as one nested Markdown list.

            Each ``st.markdown`` call is rendered as its own Markdown document,
            so leading spaces on separate calls cannot express nesting. The
            subtree is therefore assembled into a single bullet list and
            emitted with one call.

            Args:
                node: Tree node exposing ``title``, ``keywords``, ``start_page``,
                    ``end_page`` and ``children``.
                level: Depth of ``node`` within the full tree. Only level 0 is
                    rendered with the bold folder style.
            """
            lines = []

            def collect(current, depth):
                # Indentation is relative to ``node`` so the first line of the
                # generated document always starts at column 0.
                pad = "  " * depth
                if level + depth == 0:
                    lines.append(f"{pad}- **📁 {current.title}**")
                else:
                    lines.append(f"{pad}- 📄 {current.title}")

                # Details become sub-bullets of the node they describe
                detail_pad = pad + "  "
                if current.keywords:
                    keywords_str = ", ".join(current.keywords[:10])  # Show first 10
                    if len(current.keywords) > 10:
                        keywords_str += f" ... and {len(current.keywords) - 10} more"
                    lines.append(f"{detail_pad}- **Keywords:** {keywords_str}")

                if current.start_page and current.end_page:
                    lines.append(f"{detail_pad}- **Pages:** {current.start_page}-{current.end_page}")

                for child in current.children or []:
                    collect(child, depth + 1)

            collect(node, 0)
            st.markdown("\n".join(lines))

        display_tree_node(page_index.tree_root)
    elif page_index.documents:
        # Fallback: Display documents directly
        st.info(f"📄 Displaying {len(page_index.documents)} documents")

        # Only the document objects are needed, not their ids
        for doc in page_index.documents.values():
            st.markdown(f"**📄 {os.path.basename(doc.source_file)}**")
            st.markdown(f"  - **Pages:** {doc.total_pages}")
            st.markdown(f"  - **Type:** {doc.file_type.value}")
            st.divider()
    else:
        st.warning("⚠ No documents in index")
    
    # Keyword Search
    st.subheader("🔍 Keyword Search")

    search_query = st.text_input("Search keywords:", placeholder="e.g., burger, price, menu")
    keyword_index = page_index.page_index

    if search_query and keyword_index:
        normalized_query = search_query.strip().lower()
        if normalized_query in keyword_index:
            # The full input is itself an indexed key: keep the exact-match behaviour
            search_terms = [normalized_query]
        else:
            # The placeholder invites a comma-separated list, so look each term
            # up on its own (dict.fromkeys drops repeats while keeping order)
            search_terms = list(dict.fromkeys(
                term.strip() for term in normalized_query.split(",") if term.strip()
            ))

        for term in search_terms:
            if term not in keyword_index:
                st.info(f"Keyword '{term}' not found in index")
                continue

            refs = keyword_index[term]
            st.markdown(f"**Found '{term}' in {len(refs)} location(s):**")

            for ref in refs[:5]:  # Show first 5
                st.markdown(f"- 📄 Document: `{ref.doc_id}`, Page {ref.page_number}")
                if ref.snippet:
                    # Collapse whitespace so a multi-line snippet stays in one blockquote
                    snippet = " ".join(ref.snippet[:200].split())
                    st.markdown(f"  > {snippet}")
            if len(refs) > 5:
                # Make the truncation visible instead of silently hiding hits
                st.caption(f"... and {len(refs) - 5} more")
    elif search_query:
        # Previously a query against an empty index produced no feedback at all
        st.info("The keyword index is empty, nothing to search")

    # Raw Index Data (collapsible): two counters from the metadata plus up to
    # 50 sample keywords from the keyword index
    with st.expander("📋 View Raw Index Data"):
        raw_index_summary = {
            'total_keywords': page_index.metadata.get('total_keywords', 0),
            'total_tree_nodes': page_index.metadata.get('total_tree_nodes', 0),
        }
        if page_index.page_index:
            raw_index_summary['sample_keywords'] = list(page_index.page_index.keys())[:50]
        else:
            raw_index_summary['sample_keywords'] = []
        st.json(raw_index_summary)

with tab6:
    st.header("📄 Business Profile")
    
    if not st.session_state.get('business_profile'):
        # Nothing generated yet: explain what the tab will show and how to get there
        st.info("👆 Click 'Generate Business Profile with AI' in the Results tab to create a business profile")
        
        st.markdown("""
        ### What is a Business Profile?
        
        A structured digital profile containing:
        
        - **Business Information**: Name, description, location, contact, hours
        - **Product Inventory**: Products with pricing, specifications, inventory
        - **Service Inventory**: Services with pricing, itineraries, FAQs
        - **Data Provenance**: Track where each field came from
        
        ### How It Works:
        
        1. Upload business documents (PDFs, DOCX, images)
        2. Run processing pipeline (6 agents)
        3. Click "Generate Business Profile with AI"
        4. Groq AI extracts and structures the information
        5. View results here!
        """)
    else:
        profile = st.session_state.business_profile
        
        # Business Type Badge
        # `or` rather than a .get default: the key can be present but null,
        # which would make .upper() below raise AttributeError
        business_type = profile.get('business_type') or 'unknown'
        if business_type == 'product':
            type_emoji = "🏪"
        elif business_type == 'service':
            type_emoji = "💼"
        else:
            type_emoji = "🏢"
        st.markdown(f"### {type_emoji} Business Type: **{business_type.upper()}**")
        
        # Download JSON button
        # Keys starting with "_" are left out of the export; default=str
        # stringifies any value json cannot serialise natively
        exportable_profile = {k: v for k, v in profile.items() if not str(k).startswith('_')}
        profile_json = json.dumps(exportable_profile, indent=2, ensure_ascii=False, default=str)
        # Fall back to a fixed suffix when no job id is stored in the session
        job_suffix = st.session_state.get('job_id') or 'export'
        st.download_button(
            label="📥 Download Profile JSON",
            data=profile_json,
            file_name=f"business_profile_{job_suffix}.json",
            mime="application/json"
        )
        
        st.divider()
        
        # Business Info
        st.subheader("📊 Business Information")
        # `or {}` also covers a key that is present but null; anything that is
        # not a dict is treated as "no information" instead of crashing on .get
        business_info = profile.get('business_info') or {}
        if not isinstance(business_info, dict):
            business_info = {}
        
        col1, col2 = st.columns(2)
        with col1:
            if business_info.get('name'):
                st.markdown(f"**Name:** {business_info['name']}")
            if business_info.get('description'):
                st.markdown(f"**Description:** {business_info['description']}")
            if business_info.get('category'):
                st.markdown(f"**Category:** {business_info['category']}")
        
        with col2:
            location = business_info.get('location') or {}
            if isinstance(location, dict):
                if location:
                    st.markdown("**Location:**")
                    if location.get('address'):
                        st.markdown(f"  - Address: {location['address']}")
                    if location.get('city'):
                        st.markdown(f"  - City: {location['city']}")
                    if location.get('state'):
                        st.markdown(f"  - State: {location['state']}")
            else:
                # Free-form location (e.g. a single address string): show it as-is
                st.markdown(f"**Location:** {location}")
        
        # Contact Info
        contact = business_info.get('contact') or {}
        if isinstance(contact, dict):
            if contact:
                st.markdown("**Contact:**")
                col_a, col_b = st.columns(2)
                with col_a:
                    if contact.get('phone'):
                        st.markdown(f"  📞 Phone: {contact['phone']}")
                    if contact.get('email'):
                        st.markdown(f"  📧 Email: {contact['email']}")
                with col_b:
                    if contact.get('website'):
                        st.markdown(f"  🌐 Website: {contact['website']}")
        else:
            # Free-form contact value: show it as-is
            st.markdown(f"**Contact:** {contact}")
        
        st.divider()
        
        # Products
        products = profile.get('products') or []
        if products:
            st.subheader(f"📦 Products ({len(products)})")
            for i, product in enumerate(products, 1):
                # `or` fallbacks also replace values that are present but null/empty
                with st.expander(f"**{i}. {product.get('name') or 'Product'}**"):
                    st.write(f"**Description:** {product.get('description') or 'N/A'}")

                    # Same guard as the services section: only treat pricing as a mapping
                    pricing = product.get('pricing')
                    if pricing and isinstance(pricing, dict):
                        base_price = pricing.get('base_price')
                        # NOTE(review): products default to USD while services default to INR - confirm intended
                        currency = pricing.get('currency') or 'USD'
                        if base_price in (None, ''):
                            st.write("**Price:** N/A")
                        else:
                            st.write(f"**Price:** {base_price} {currency}")

                    specifications = product.get('specifications')
                    if specifications and isinstance(specifications, dict):
                        st.write("**Specifications:**")
                        for key, value in specifications.items():
                            if value:
                                st.write(f"  - {key}: {value}")
        
        st.divider()
        
        # ============== SERVICES (COMPREHENSIVE DISPLAY) ==============
        services = profile.get('services') or []
        if services:
            st.subheader(f"💼 Services ({len(services)})")
            
            # Service completeness overview
            st.markdown("**Service Completeness:**")
            completeness_fields = (
                'name', 'description', 'category', 'pricing', 'details',
                'itinerary', 'inclusions', 'exclusions', 'cancellation_policy',
                'payment_policy', 'travel_info', 'faqs', 'tags',
            )
            # Derived from the tuple so the denominator cannot drift from the field list
            total = len(completeness_fields)
            for i, service in enumerate(services, 1):
                # Any truthy value counts as filled; empty strings, lists and dicts do not
                filled = sum(1 for field in completeness_fields if service.get(field))
                pct = int(filled / total * 100)
                # `or` also replaces a name that is present but null/empty
                service_label = service.get('name') or f'Service {i}'
                st.progress(pct / 100, text=f"{service_label}: {pct}% ({filled}/{total} fields)")
            
            st.divider()
            
            # Render each service in detail (only the first expander starts open)
            for i, service in enumerate(services, 1):
                # `or` also replaces a name that is present but null/empty,
                # which would otherwise render as "None" in the expander title
                svc_name = service.get('name') or f'Service {i}'
                with st.expander(f"🏔️ **{i}. {svc_name}**", expanded=(i == 1)):
                    
                    # --- Basic Info ---
                    st.markdown("#### 📋 Basic Information")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"**Name:** {svc_name}")
                        st.markdown(f"**Category:** {service.get('category') or 'N/A'}")
                    with col2:
                        if service.get('description'):
                            st.markdown(f"**Description:** {service['description']}")
                    # --- Pricing ---
                    pricing = service.get('pricing')
                    if pricing and isinstance(pricing, dict):
                        st.markdown("#### 💰 Pricing")
                        # Read every value up front: the currency is shared by two
                        # columns, and `or` replaces values that are present but null
                        bp = pricing.get('base_price')
                        dp = pricing.get('discount_price')
                        curr = pricing.get('currency') or 'INR'
                        price_type = pricing.get('price_type') or 'N/A'

                        pcol1, pcol2, pcol3 = st.columns(3)
                        with pcol1:
                            st.metric("Base Price", f"{curr} {bp}" if bp else "N/A")
                        with pcol2:
                            st.markdown(f"**Price Type:** {price_type}")
                        with pcol3:
                            if dp:
                                st.metric("Discount Price", f"{curr} {dp}")
                    
                    # --- Trek Details ---
                    details = service.get('details')
                    if details and isinstance(details, dict):
                        st.markdown("#### 🏔️ Trek Details")
                        dcol1, dcol2, dcol3 = st.columns(3)
                        with dcol1:
                            if details.get('duration'):
                                st.markdown(f"⏱️ **Duration:** {details['duration']}")
                            if details.get('difficulty_level'):
                                diff = details['difficulty_level']
                                # str() so a non-string level (e.g. a numeric grade) cannot
                                # break .lower(); lower-case once and reuse
                                diff_text = str(diff).lower()
                                if 'easy' in diff_text:
                                    diff_emoji = "🟢"
                                elif 'moderate' in diff_text:
                                    diff_emoji = "🟡"
                                else:
                                    diff_emoji = "🔴"
                                st.markdown(f"{diff_emoji} **Difficulty:** {diff}")
                        with dcol2:
                            if details.get('max_altitude'):
                                st.markdown(f"🏔️ **Max Altitude:** {details['max_altitude']}")
                            if details.get('total_distance'):
                                st.markdown(f"📏 **Distance:** {details['total_distance']}")
                        with dcol3:
                            if details.get('starting_point'):
                                st.markdown(f"📍 **Start:** {details['starting_point']}")
                            if details.get('ending_point'):
                                st.markdown(f"📍 **End:** {details['ending_point']}")
                        
                        # Full-width rows below the three columns
                        if details.get('group_size'):
                            st.markdown(f"👥 **Group Size:** {details['group_size']}")
                        if details.get('best_time'):
                            st.markdown(f"📅 **Best Time:** {details['best_time']}")
                    
                    # --- Itinerary ---
                    itinerary = service.get('itinerary', [])
                    if itinerary and isinstance(itinerary, list):
                        st.markdown(f"#### 🗓️ Day-by-Day Itinerary ({len(itinerary)} days)")
                        
                        for day_data in itinerary:
                            # Entries that are not mappings cannot be rendered; skip them
                            if not isinstance(day_data, dict):
                                continue
                            
                            day_num = day_data.get('day')
                            if day_num is None:
                                day_num = '?'
                            day_desc = day_data.get('description', '')
                            # `or` so a null/empty title falls back to the description
                            # too, not only a missing key
                            day_title = day_data.get('title') or day_desc or 'N/A'
                            day_alt = day_data.get('altitude', '')
                            day_dist = day_data.get('distance', '')
                            
                            header = f"**Day {day_num}: {day_title}**"
                            if day_alt:
                                header += f" | 🏔️ {day_alt}"
                            if day_dist:
                                header += f" | 📏 {day_dist}"
                            
                            st.markdown(header)
                            # Avoid repeating the description when it already serves as the title
                            if day_desc and day_desc != day_title:
                                st.caption(day_desc)
                            
                            # Show activities if present
                            # (str() each entry: join() raises TypeError on non-string items)
                            activities = day_data.get('activities', [])
                            if activities and isinstance(activities, list):
                                st.markdown("  " + " → ".join(map(str, activities)))
                            
                            # Show meals if present
                            meals = day_data.get('meals', [])
                            if meals and isinstance(meals, list):
                                st.markdown(f"  🍽️ Meals: {', '.join(map(str, meals))}")
                            
                            # Show accommodation if present
                            accommodation = day_data.get('accommodation')
                            if accommodation:
                                st.markdown(f"  🏠 Stay: {accommodation}")
                    else:
                        st.markdown("#### 🗓️ Itinerary")
                        st.caption("No itinerary data extracted")
                    
                    # --- Inclusions & Exclusions ---
                    # Both lists share one layout: a heading plus one marked line per
                    # item, or a short caption when the list is missing or not a list
                    incl = service.get('inclusions', [])
                    excl = service.get('exclusions', [])
                    if incl or excl:
                        st.markdown("#### ✅ Inclusions & ❌ Exclusions")
                        icol1, icol2 = st.columns(2)
                        side_by_side = (
                            (icol1, incl, "**✅ Included:**", "✓", "No inclusions data"),
                            (icol2, excl, "**❌ Excluded:**", "✗", "No exclusions data"),
                        )
                        for column, entries, heading, mark, empty_note in side_by_side:
                            with column:
                                if not (entries and isinstance(entries, list)):
                                    st.caption(empty_note)
                                    continue
                                st.markdown(heading)
                                for item in entries:
                                    st.markdown(f"  {mark} {item}")
                    
                    # --- Policies ---
                    # The section appears only when at least one policy is present
                    cancel_policy = service.get('cancellation_policy')
                    pay_policy = service.get('payment_policy')
                    if cancel_policy or pay_policy:
                        st.markdown("#### 📜 Policies")
                        labelled_policies = (
                            ("Cancellation Policy", cancel_policy),
                            ("Payment Policy", pay_policy),
                        )
                        for policy_label, policy_text in labelled_policies:
                            if policy_text:
                                st.markdown(f"**{policy_label}:** {policy_text}")
                    
                    # --- Travel Info ---
                    travel = service.get('travel_info')
                    # any(...) hides the section when every field in the dict is empty
                    if travel and isinstance(travel, dict) and any(travel.values()):
                        st.markdown("#### 🚂 Travel Information")
                        if travel.get('how_to_reach'):
                            st.markdown(f"**How to Reach:** {travel['how_to_reach']}")
                        tcol1, tcol2 = st.columns(2)
                        with tcol1:
                            if travel.get('nearest_railway'):
                                st.markdown(f"🚆 **Railway:** {travel['nearest_railway']}")
                        with tcol2:
                            if travel.get('nearest_airport'):
                                st.markdown(f"✈️ **Airport:** {travel['nearest_airport']}")
                        landmarks = travel.get('nearby_landmarks', [])
                        if landmarks and isinstance(landmarks, list):
                            # str() each entry: join() raises TypeError on non-string items
                            st.markdown(f"📍 **Landmarks:** {', '.join(map(str, landmarks))}")
                    
                    # --- FAQs ---
                    # The heading counts every entry; only dict entries are rendered
                    faqs = service.get('faqs', [])
                    if faqs and isinstance(faqs, list):
                        st.markdown(f"#### ❓ FAQs ({len(faqs)})")
                        for faq in faqs:
                            if not isinstance(faq, dict):
                                continue
                            question = faq.get('question', 'N/A')
                            answer = faq.get('answer', 'N/A')
                            st.markdown(f"**Q: {question}**")
                            st.markdown(f"A: {answer}")
                    
                    # --- What to Carry ---
                    carry = service.get('what_to_carry', [])
                    if carry and isinstance(carry, list):
                        st.markdown("#### 🎒 What to Carry")
                        ccol1, ccol2 = st.columns(2)
                        # Ceiling split: columns differ by at most one item, with the
                        # left one taking the extra item when the count is odd.
                        # (`len // 2 + 1` gave 3/1 for four items and 2/0 for two.)
                        half = (len(carry) + 1) // 2
                        with ccol1:
                            for item in carry[:half]:
                                st.markdown(f"  • {item}")
                        with ccol2:
                            for item in carry[half:]:
                                st.markdown(f"  • {item}")
                    
                    # --- Risk & Safety ---
                    risk = service.get('risk_and_safety')
                    if risk:
                        st.markdown("#### ⚠️ Risk & Safety")
                        st.warning(risk)
                    
                    # --- Tags ---
                    tags = service.get('tags', [])
                    if tags and isinstance(tags, list):
                        st.markdown("#### 🏷️ Tags")
                        st.markdown(" ".join([f"`{tag}`" for tag in tags]))
        else:
            st.info("No services extracted")
        
        st.divider()
        
        # Metadata
        st.subheader("📋 Extraction Metadata")
        # `or` fallbacks cover keys that exist but hold null: formatting None
        # with :.2f / :.0% raises TypeError
        metadata = profile.get('extraction_metadata') or {}
        processing_time = float(metadata.get('processing_time') or 0)
        confidence_score = float(metadata.get('confidence_score') or 0)

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Processing Time", f"{processing_time:.2f}s")
        with col2:
            st.metric("Source Files", metadata.get('source_files_count') or 0)
        with col3:
            st.metric("Confidence", f"{confidence_score:.0%}")
        with col4:
            st.metric("LLM Calls", metadata.get('llm_calls_made') or 0)
        
        st.markdown(f"**Method:** {metadata.get('extraction_method') or 'unknown'}")
        st.markdown(f"**Version:** {metadata.get('version') or '1.0'}")
        
        # Raw JSON viewer (shows the profile exactly as stored in the session)
        with st.expander("🔍 View Raw Profile JSON"):
            st.json(profile)

# Footer: a divider followed by a centered HTML credit line
# (unsafe_allow_html is needed because the markup is a raw <div>)
footer_html = """
<div style="text-align: center; color: #666; padding: 1rem;">
    <b>Digi-Biz</b> - Agentic Business Digitization Framework<br>
    Powered by Groq Vision (Llama-4-Scout) • Ollama Fallback • Multi-Agent Pipeline
</div>
"""
st.divider()
st.markdown(footer_html, unsafe_allow_html=True)