| """ |
| Digi-Biz: Agentic Business Digitization Framework |
| Streamlit Demo Application |
| |
| This app demonstrates the complete workflow: |
| 1. Upload a ZIP archive of business documents |
| 2. File Discovery Agent extracts and classifies files |
| 3. Document Parsing Agent extracts text from documents |
| 4. Table Extraction Agent detects and classifies tables |
| 5. Media Extraction Agent extracts images |
| 6. Vision Agent (Groq Llama-4-Scout) analyzes images |
| 7. Indexing Agent builds the search index |
| 8. View results and generate a business profile |
| """ |
| import streamlit as st |
| import os |
| import tempfile |
| import shutil |
| from pathlib import Path |
| from datetime import datetime |
| import json |
| import io |
| from PIL import Image |
| from backend.utils.logger import get_logger |
|
|
| logger = get_logger(__name__) |
|
|
| |
| from dotenv import load_dotenv |
| env_path = Path(__file__).parent / ".env" |
| if env_path.exists(): |
| load_dotenv(env_path) |
|
|
| |
| try: |
| from groq import Groq |
| GROQ_AVAILABLE = True |
| except ImportError: |
| GROQ_AVAILABLE = False |
|
|
| |
| from backend.agents.file_discovery import FileDiscoveryAgent, FileDiscoveryInput |
| from backend.agents.document_parsing import DocumentParsingAgent, DocumentParsingInput |
| from backend.agents.table_extraction import TableExtractionAgent, TableExtractionInput |
| from backend.agents.media_extraction import MediaExtractionAgent, MediaExtractionInput |
| from backend.agents.vision_agent import VisionAgent, VisionAnalysisInput |
| from backend.agents.indexing import IndexingAgent, IndexingInput |
| from backend.utils.storage_manager import StorageManager |
|
|
|
|
| |
| |
| |
# Browser-tab title/icon and overall page layout.
_PAGE_SETTINGS = {
    "page_title": "Digi-Biz - Business Digitization",
    "page_icon": "π",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the CSS classes used by the raw-HTML snippets that the rest
# of the script passes to st.markdown(..., unsafe_allow_html=True).
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1E88E5;
text-align: center;
margin-bottom: 1rem;
}
.sub-header {
font-size: 1.2rem;
color: #666;
text-align: center;
margin-bottom: 2rem;
}
.success-box {
padding: 1rem;
border-radius: 0.5rem;
background-color: #E8F5E9;
border-left: 4px solid #4CAF50;
margin: 1rem 0;
}
.info-box {
padding: 1rem;
border-radius: 0.5rem;
background-color: #E3F2FD;
border-left: 4px solid #2196F3;
margin: 1rem 0;
}
.agent-card {
padding: 1rem;
border-radius: 0.5rem;
background-color: #f5f5f5;
margin: 0.5rem 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
|
|
| |
| |
| |
# Seed the per-session keys the rest of the script reads. Keys that already
# exist are left untouched so results survive Streamlit reruns.
_SESSION_DEFAULTS = {
    'job_id': "",
    'discovery_output': None,
    'parsing_output': None,
    'tables_output': None,
    'media_output': None,
    'vision_output': None,
    'processing_complete': False,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
|
|
|
| |
| |
| |
def generate_job_id():
    """Generate a unique job ID.

    The ID keeps the original ``job_<YYYYMMDD>_<HHMMSS>`` prefix and appends a
    short random suffix. The timestamp alone only has one-second resolution,
    so two jobs created within the same second would otherwise share an ID
    (and, in this app, the same temp directory).

    Returns:
        str: ``job_<YYYYMMDD>_<HHMMSS>_<8 hex chars>``.
    """
    # Local import: the file's top-level imports do not include uuid.
    import uuid

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    return f"job_{timestamp}_{uuid.uuid4().hex[:8]}"
|
|
|
|
def cleanup_temp_dirs():
    """Delete the ``digi_biz`` directory under the system temp dir, if present.

    The whole tree is removed, i.e. the working directories of every job
    stored beneath it. Does nothing when the directory does not exist.
    """
    scratch_root = Path(tempfile.gettempdir()).joinpath("digi_biz")
    if not scratch_root.exists():
        return
    shutil.rmtree(scratch_root)
|
|
|
|
def _ollama_model_names(response):
    """Return the model names contained in an Ollama ``list()`` response.

    Handles a dict with a ``'models'`` key as well as an object with a
    ``models`` attribute. Each entry's name is read from ``name`` first and
    from ``model`` as a fallback, for both dict and attribute-style entries.
    Entries without either field are skipped.
    """
    if isinstance(response, dict):
        entries = response.get('models') or []
    else:
        entries = getattr(response, 'models', None) or []

    names = []
    for entry in entries:
        if isinstance(entry, dict):
            name = entry.get('name') or entry.get('model')
        else:
            name = getattr(entry, 'name', None) or getattr(entry, 'model', None)
        if name:
            names.append(name)
    return names


def get_model_status():
    """Check if Ollama and a Qwen 3.5 model are available.

    Returns:
        tuple: ``(ollama_ok, qwen_available, vision_working, models)`` where
        ``models`` is the list of model names reported by the local Ollama
        server. Any failure to import the client or reach the server yields
        ``(False, False, False, [])``.
    """
    try:
        from ollama import Client
        client = Client(host='http://localhost:11434', timeout=5)
        models = _ollama_model_names(client.list())
    except Exception:
        # Client not installed or server unreachable: report everything as down.
        return False, False, False, []

    qwen_models = [m for m in models if 'qwen3.5' in m]
    qwen_available = bool(qwen_models)

    vision_working = False
    if qwen_available:
        # Probe a tag that is actually installed; keep the historical default
        # when it is present.
        default_tag = 'qwen3.5:0.8b'
        probe_model = default_tag if default_tag in qwen_models else qwen_models[0]
        try:
            # Longer timeout than the listing call: this runs real inference.
            test_client = Client(host='http://localhost:11434', timeout=30)
            test_img = Image.new('RGB', (50, 50), color='red')
            test_bytes = io.BytesIO()
            test_img.save(test_bytes, format='PNG')

            test_response = test_client.chat(
                model=probe_model,
                messages=[{
                    'role': 'user',
                    'content': 'What color?',
                    'images': [test_bytes.getvalue()]
                }],
                options={'timeout': 20000}
            )

            # NOTE(review): a correct but terse reply such as "Red" is shorter
            # than this threshold and is reported as not working - confirm
            # the intended cut-off.
            vision_working = len(test_response['message']['content'].strip()) > 10
        except Exception:
            vision_working = False

    return True, qwen_available, vision_working, models
|
|
|
|
| |
| |
| |
|
|
| |
# Page title and subtitle.
st.markdown('<h1 class="main-header">π Digi-Biz</h1>', unsafe_allow_html=True)
st.markdown('<p class="sub-header">Agentic Business Digitization Framework</p>', unsafe_allow_html=True)

# Sidebar: provider health checks, the agent overview and the reset button.
with st.sidebar:
    st.header("π§ Configuration")

    st.subheader("Model Status")

    # --- Groq (primary vision provider) ---
    groq_ok = False
    groq_model = "N/A"
    groq_error = ""

    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        groq_error = "GROQ_API_KEY not set in .env"
    elif api_key == "gsk_YOUR_API_KEY_HERE":
        groq_error = "Using placeholder key"
    elif not GROQ_AVAILABLE:
        # The optional import at the top of the file failed, so the name
        # `Groq` does not exist; report that instead of raising NameError.
        groq_error = "groq package not installed"
    else:
        try:
            # Listing models is used only as a connectivity/auth check.
            Groq(api_key=api_key, timeout=5).models.list()
            groq_ok = True
            groq_model = "llama-4-scout-17b"
        except Exception as e:
            groq_error = str(e)[:100]

    if groq_ok:
        st.success(f"β Groq API: {groq_model}")
    else:
        st.error("β Groq API Not Available")
        st.code(groq_error, language=None)
        st.info("Fix: Get key from https://console.groq.com and add to .env file")

    # --- Ollama (optional fallback) ---
    ollama_ok = False
    try:
        from ollama import Client
        client = Client(host='http://localhost:11434', timeout=5)
        client.list()
        ollama_ok = True
    except Exception:
        # Best-effort probe: any failure simply means "not running".
        ollama_ok = False

    if ollama_ok:
        st.success("β Ollama: Fallback Ready")
    else:
        st.warning("β Ollama: Not Running (optional)")

    st.divider()

    st.subheader("Agents")
    st.markdown("""
<div class="agent-card">
<b>1. File Discovery</b><br>
<small>Extracts & classifies files from ZIP</small>
</div>

<div class="agent-card">
<b>2. Document Parsing</b><br>
<small>Extracts text from PDF/DOCX</small>
</div>

<div class="agent-card">
<b>3. Table Extraction</b><br>
<small>Detects & classifies tables</small>
</div>

<div class="agent-card">
<b>4. Media Extraction</b><br>
<small>Extracts embedded images</small>
</div>

<div class="agent-card">
<b>5. Vision Agent</b><br>
<small>Analyzes images with Groq</small>
</div>

<div class="agent-card">
<b>6. Indexing Agent</b><br>
<small>Builds RAG search index</small>
</div>
""", unsafe_allow_html=True)

    st.divider()

    # Reset: wipe temp files, blank every session key, then rerun the script.
    if st.button("π Reset All", use_container_width=True):
        cleanup_temp_dirs()
        for key in list(st.session_state.keys()):
            st.session_state[key] = None
        st.session_state.processing_complete = False
        st.rerun()
|
|
| |
# Main content tabs.
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["π€ Upload", "βοΈ Processing", "π Results", "πΌοΈ Vision Analysis", "π³ Index Tree", "π Business Profile"])


# Tab 1: upload the ZIP archive and start processing.
with tab1:
    st.header("Upload Business Documents")

    st.markdown("""
**Supported Formats:**
- π Documents: PDF, DOCX, DOC
- π Spreadsheets: XLSX, XLS, CSV
- πΌοΈ Images: JPG, PNG, GIF, WEBP
- π₯ Videos: MP4, AVI, MOV

**Instructions:**
1. Create a ZIP file with your business documents
2. Upload using the file uploader below
3. Click "Start Processing"
""")

    uploaded_file = st.file_uploader(
        "Upload ZIP file",
        type=['zip'],
        help="Select a ZIP file containing business documents"
    )

    if uploaded_file:
        st.success(f"β Uploaded: {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")

        # The script reruns on every interaction while the uploader still
        # holds the file. Only create a job directory and write the archive
        # when the upload actually changed, otherwise every rerun would spawn
        # a new job and another copy of the ZIP.
        upload_signature = (
            getattr(uploaded_file, 'file_id', None),
            uploaded_file.name,
            uploaded_file.size,
        )
        is_new_upload = (
            st.session_state.get('upload_signature') != upload_signature
            or not st.session_state.get('zip_path')
        )

        if is_new_upload:
            temp_dir = Path(tempfile.gettempdir()) / "digi_biz" / generate_job_id()
            temp_dir.mkdir(parents=True, exist_ok=True)

            # Keep only the final path component of the client-supplied name
            # so it cannot escape the job directory.
            safe_name = Path(uploaded_file.name).name or "upload.zip"
            zip_path = temp_dir / safe_name
            zip_path.write_bytes(uploaded_file.getvalue())

            st.session_state.zip_path = str(zip_path)
            st.session_state.job_id = temp_dir.name
            st.session_state.upload_signature = upload_signature

        st.info(f"Job ID: `{st.session_state.job_id}`")

        if st.button("π Start Processing", type="primary", use_container_width=True):
            st.session_state.processing_started = True
            st.rerun()
|
|
def _run_processing_pipeline():
    """Run the six processing agents for the current job.

    Reads ``job_id`` and ``zip_path`` from ``st.session_state`` and stores
    each agent's output back into session state. Progress and per-step
    messages are rendered into the current Streamlit container.

    Returns:
        bool: True when the pipeline reached the end, False when file
        discovery failed or the vision/indexing stage raised.
    """
    job_id = st.session_state.job_id

    # Clear results of any earlier run so a failing step cannot leave another
    # job's data behind for the result tabs.
    for stale_key in ('discovery_output', 'parsing_output', 'tables_output',
                      'media_output', 'vision_output'):
        st.session_state[stale_key] = None
    st.session_state.page_index_dict = None
    st.session_state.page_index_has_data = False
    st.session_state.processing_complete = False

    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: file discovery. Nothing else can run without it, so a failure
    # ends the pipeline.
    status_text.text("Step 1/6: File Discovery Agent...")
    try:
        storage_manager = StorageManager(storage_base=str(Path(tempfile.gettempdir()) / "digi_biz" / job_id))

        discovery_agent = FileDiscoveryAgent(storage_manager=storage_manager)
        discovery_input = FileDiscoveryInput(
            zip_file_path=st.session_state.zip_path,
            job_id=job_id
        )
        discovery_output = discovery_agent.discover(discovery_input)
        st.session_state.discovery_output = discovery_output

        progress_bar.progress(20)

        if discovery_output.success:
            summary = discovery_output.summary
            st.success(f"β File Discovery Complete: {discovery_output.total_files} files")
            st.markdown(f"""
<div class="success-box">
<b>Summary:</b><br>
β’ Documents: {summary.get('documents_count', 0)}<br>
β’ Spreadsheets: {summary.get('spreadsheets_count', 0)}<br>
β’ Images: {summary.get('images_count', 0)}<br>
β’ Videos: {summary.get('videos_count', 0)}
</div>
""", unsafe_allow_html=True)
        else:
            st.error(f"β File Discovery Failed: {discovery_output.errors}")
            return False

    except Exception as e:
        st.error(f"File Discovery Error: {str(e)}")
        return False

    # Step 2: document parsing (non-fatal).
    status_text.text("Step 2/6: Document Parsing Agent...")
    try:
        parsing_agent = DocumentParsingAgent(enable_ocr=False)
        parsing_input = DocumentParsingInput(
            documents=discovery_output.documents,
            job_id=job_id,
            enable_ocr=False
        )
        parsing_output = parsing_agent.parse(parsing_input)
        st.session_state.parsing_output = parsing_output

        progress_bar.progress(40)

        if parsing_output.success:
            st.success(f"β Document Parsing Complete: {parsing_output.total_pages} pages")
        else:
            st.warning("β Document Parsing: No documents to parse")

    except Exception as e:
        st.warning(f"Document Parsing: {str(e)}")

    # Step 3: table extraction (non-fatal).
    status_text.text("Step 3/6: Table Extraction Agent...")
    try:
        table_agent = TableExtractionAgent()
        table_input = TableExtractionInput(
            parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
            job_id=job_id
        )
        tables_output = table_agent.extract(table_input)
        st.session_state.tables_output = tables_output

        progress_bar.progress(60)

        if tables_output.success:
            st.success(f"β Table Extraction Complete: {tables_output.total_tables} tables")
            if tables_output.tables_by_type:
                types_str = ", ".join([f"{k}: {v}" for k, v in tables_output.tables_by_type.items()])
                st.info(f"Types: {types_str}")
        else:
            st.warning("β Table Extraction: No tables found")

    except Exception as e:
        st.warning(f"Table Extraction: {str(e)}")

    # Step 4: media extraction (non-fatal).
    status_text.text("Step 4/6: Media Extraction Agent...")
    try:
        media_agent = MediaExtractionAgent(enable_deduplication=True)
        media_input = MediaExtractionInput(
            parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
            standalone_files=[img.file_path for img in discovery_output.images],
            job_id=job_id
        )
        media_output = media_agent.extract_all(media_input)
        st.session_state.media_output = media_output

        progress_bar.progress(80)

        if media_output.success:
            st.success(f"β Media Extraction Complete: {media_output.total_images} images")
            if media_output.duplicates_removed > 0:
                st.info(f"Removed {media_output.duplicates_removed} duplicates")
        else:
            st.warning("β Media Extraction: No images found")

    except Exception as e:
        st.warning(f"Media Extraction: {str(e)}")

    # Steps 5 and 6 share one guard: any unexpected error marks the run as
    # incomplete.
    try:
        # Step 5: vision analysis, Groq first and Ollama as fallback.
        status_text.text("Step 5/6: Vision Agent (Groq Llama-4-Scout)...")

        extracted_media = st.session_state.media_output
        all_images = []
        if extracted_media and extracted_media.success:
            all_images = extracted_media.media.images
        # Only the first five images are sent to the vision model.
        images_to_analyze = all_images[:5]

        if images_to_analyze:
            st.info(f"Analyzing {len(images_to_analyze)} images with Groq Vision (Llama-4-Scout)...")
            progress_vision = st.progress(0)

            analyses = None
            try:
                # The agent is created inside the guard so that a Groq setup
                # error also falls back to Ollama.
                vision_agent = VisionAgent(provider="groq", timeout=120)
                analyses = vision_agent.analyze_batch(images_to_analyze)
                progress_vision.progress(100)
                st.success(f"β Vision Analysis Complete: {len(analyses)} images analyzed")
            except Exception as ve:
                st.warning(f"Vision analysis failed: {str(ve)}")
                st.info("Falling back to Ollama...")
                try:
                    vision_agent_ollama = VisionAgent(provider="ollama", timeout=120)
                    analyses = vision_agent_ollama.analyze_batch(images_to_analyze)
                    st.success(f"β Vision Analysis Complete (via Ollama): {len(analyses)} images")
                except Exception as e2:
                    analyses = None
                    st.error(f"All vision providers failed: {e2}")

            st.session_state.vision_output = analyses

            # The summary is built outside the provider guard, so a problem
            # here can no longer be mistaken for a provider failure.
            if analyses:
                categories = {}
                for a in analyses:
                    cat = getattr(a.category, 'value', a.category)
                    categories[cat] = categories.get(cat, 0) + 1

                st.markdown("**Categories Detected:**")
                cat_text = ", ".join([f"{k}: {v}" for k, v in categories.items()])
                st.info(cat_text)
        else:
            st.session_state.vision_output = None
            st.warning("β Vision Analysis: No images to analyze")

        # Step 6: build the search index (non-fatal).
        status_text.text("Step 6/6: Building Search Index (RAG)...")
        try:
            indexing_agent = IndexingAgent()

            indexing_input = IndexingInput(
                parsed_documents=st.session_state.parsing_output.parsed_documents if st.session_state.parsing_output else [],
                tables=st.session_state.tables_output.tables if st.session_state.tables_output else [],
                images=all_images,
                job_id=job_id
            )

            page_index = indexing_agent.build_index(indexing_input)

            # Stored as a plain dict; the result tabs rebuild the model from it.
            st.session_state.page_index_dict = page_index.model_dump(mode='json')
            st.session_state.page_index_has_data = True

            st.success(f"β Index Built: {page_index.metadata.get('total_keywords', 0)} keywords")

        except Exception as e:
            st.warning(f"Indexing failed: {str(e)}")
            st.session_state.page_index_dict = None
            st.session_state.page_index_has_data = False

        progress_bar.progress(100)
        status_text.text("β Processing Complete!")

        st.session_state.processing_complete = True
        # Remember which job these results belong to, so reruns can skip it.
        st.session_state.processed_job_id = job_id
        return True

    except Exception as e:
        st.warning(f"Processing error: {str(e)}")
        st.session_state.processing_complete = False
        return False


# Tab 2: processing pipeline.
with tab2:
    st.header("Processing Pipeline")

    if not st.session_state.get('processing_started'):
        # No st.stop() here: it would end the whole script and leave every
        # later tab empty.
        st.info("π Upload a ZIP file and click 'Start Processing'")
    elif (st.session_state.get('processing_complete')
            and st.session_state.get('processed_job_id') == st.session_state.job_id):
        # Streamlit reruns the script on every interaction; do not repeat
        # the pipeline for a job that has already finished.
        st.success("Processing complete for this job. See the other tabs for results.")
    else:
        _run_processing_pipeline()
|
|
| |
| |
|
|
def _generate_business_profile():
    """Build the business profile from the stored page index.

    Runs the schema-mapping agent and then the validation agent, storing the
    profile in ``st.session_state.business_profile`` and the validation
    result in ``st.session_state.validation_result``. Every failure is
    reported through Streamlit messages; nothing is raised to the caller.
    """
    try:
        from backend.agents.schema_mapping_simple import SchemaMappingAgent
        from backend.models.schemas import SchemaMappingInput
        from backend.agents.validation_agent import ValidationAgent
        from backend.models.schemas import ValidationInput as ValidationInputSchema
        from backend.models.schemas import PageIndex

        if not st.session_state.get('page_index_dict'):
            st.error("No index available. Please run processing first.")
            return

        page_index = PageIndex.model_validate(st.session_state.page_index_dict)

        with st.status("Running Schema Mapping Agent...", expanded=True) as status:
            agent = SchemaMappingAgent()
            input_data = SchemaMappingInput(
                page_index=page_index,
                job_id=st.session_state.job_id
            )
            mapping_output = agent.map_to_schema(input_data)

            if mapping_output.success and mapping_output.profile:
                st.success("✅ Schema mapping complete!")
                status.update(label="Schema Mapping Complete", state="complete")
            else:
                st.warning(f"β οΈ Schema mapping had issues: {mapping_output.errors}")
                status.update(label="Schema Mapping Complete (with warnings)", state="complete")

        # Without a profile there is nothing to validate or store.
        if not mapping_output.profile:
            st.error("Failed to generate profile")
            return

        with st.status("Running Validation Agent...", expanded=True) as status:
            validation_agent = ValidationAgent()
            validation_input = ValidationInputSchema(
                profile=mapping_output.profile,
                job_id=st.session_state.job_id
            )
            validation_output = validation_agent.validate(validation_input)

            st.session_state.validation_result = validation_output.model_dump(mode='json')

            if validation_output.is_valid:
                st.success(f"✅ Validation passed! Completeness: {validation_output.completeness_score:.0%}")
                status.update(label="Validation Complete", state="complete")
            else:
                st.warning(f"β οΈ Validation found {len(validation_output.errors)} errors")
                status.update(label="Validation Complete (errors found)", state="complete")

        st.session_state.business_profile = mapping_output.profile.model_dump(mode='json')
        st.success("✅ Business Profile Generated Successfully!")
        st.info("Go to 'Business Profile' tab to view results")

    except Exception as e:
        st.error(f"Error generating profile: {str(e)}")
        logger.error(f"Schema mapping failed: {e}")


def _render_results_tab():
    """Render the Results tab: profile generation plus per-agent summaries."""
    st.header("Processing Results")

    if not st.session_state.processing_complete:
        # Return instead of st.stop() so the remaining tabs still render.
        st.info("β³ Processing not complete yet. Go to 'Processing' tab.")
        return

    st.subheader("π― Generate Business Profile")
    st.markdown("Use AI to create a structured business profile from extracted data")

    if st.button("π Generate Business Profile with AI", type="primary", use_container_width=True):
        with st.spinner("Generating business profile with Groq AI... Processing each document individually (1-2 minutes)"):
            _generate_business_profile()

    st.divider()

    # File discovery summary.
    st.subheader("π File Discovery")
    discovery = st.session_state.discovery_output
    if discovery:
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Documents", discovery.summary.get('documents_count', 0))
        with col2:
            st.metric("Spreadsheets", discovery.summary.get('spreadsheets_count', 0))
        with col3:
            st.metric("Images", discovery.summary.get('images_count', 0))
        with col4:
            st.metric("Videos", discovery.summary.get('videos_count', 0))

        with st.expander("π View File List"):
            if discovery.documents:
                st.write("**Documents:**")
                for doc in discovery.documents:
                    st.write(f"- {doc.original_name} ({doc.file_type.value})")

    # Document parsing summary.
    st.subheader("π Document Parsing")
    parsing = st.session_state.parsing_output
    if parsing and parsing.success:
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Pages", parsing.total_pages)
        with col2:
            st.metric("Processing Time", f"{parsing.processing_time:.1f}s")

        # Preview: first page of the first document, first 1000 characters.
        with st.expander("π View Extracted Text"):
            if parsing.parsed_documents:
                doc = parsing.parsed_documents[0]
                st.write(f"**Source:** {doc.source_file}")
                st.write(f"**Pages:** {doc.total_pages}")
                if doc.pages and doc.pages[0].text:
                    st.text_area("Text content", doc.pages[0].text[:1000], height=300)

    # Table extraction summary.
    st.subheader("π Table Extraction")
    tables = st.session_state.tables_output
    if tables and tables.success:
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Tables Found", tables.total_tables)
        with col2:
            st.metric("By Type", str(tables.tables_by_type))

        with st.expander("π View Tables"):
            for i, table in enumerate(tables.tables):
                st.write(f"**Table {i+1}:** {table.table_type.value}")
                st.write(f"Source: {table.source_doc}, Page: {table.source_page}")
                if table.headers:
                    st.write(f"Headers: {', '.join(table.headers)}")


# Tab 3: results.
with tab3:
    _render_results_tab()
|
|
# Marker shown in front of each category name in the Vision Analysis tab.
_CATEGORY_MARKERS = {
    'product': 'π΅',
    'service': 'π’',
    'food': 'π ',
    'destination': 'π£',
    'person': 'π΄',
    'document': 'βͺ',
    'logo': 'π‘',
    'other': 'β«'
}


def _render_vision_tab():
    """Render one card per stored vision analysis (image left, details right)."""
    st.header("πΌοΈ Vision Analysis (Groq Llama-4-Scout)")

    # Return instead of st.stop(): stopping here would end the script and
    # leave the Index Tree and Business Profile tabs empty.
    if not st.session_state.processing_complete:
        st.info("β³ Processing not complete yet.")
        return

    analyses = st.session_state.vision_output
    if not analyses:
        st.warning("β No vision analysis available. Either no images were found or analysis failed.")
        return

    media_output = st.session_state.media_output
    extracted_images = media_output.media.images if media_output else []

    for i, analysis in enumerate(analyses):
        st.divider()

        col1, col2 = st.columns([1, 2])

        with col1:
            # First extracted image whose id matches this analysis, if any.
            matching_image = next(
                (img for img in extracted_images if img.image_id == analysis.image_id),
                None,
            )
            if matching_image is not None:
                try:
                    st.image(matching_image.file_path, caption=analysis.image_id, use_container_width=True)
                except Exception:
                    # The image could not be shown; fall back to its id.
                    st.write(f"Image: {analysis.image_id}")

        with col2:
            st.subheader(f"Analysis {i+1}")

            # The category may be an enum-like object or a plain string.
            category_value = analysis.category
            if hasattr(analysis.category, 'value'):
                category_value = analysis.category.value
            elif isinstance(analysis.category, str):
                category_value = analysis.category.lower()

            category_emoji = _CATEGORY_MARKERS.get(category_value, 'βͺ')
            st.markdown(f"**Category:** {category_emoji} {category_value}")

            provider = analysis.metadata.get('provider', 'unknown')
            provider_icon = "π" if provider == 'groq' else "π¦"
            st.markdown(f"**Provider:** {provider_icon} {provider.upper()}")
            st.markdown(f"**Confidence:** {analysis.confidence:.0%}")

            if analysis.description:
                st.markdown(f"**Description:** {analysis.description}")

            if analysis.tags:
                st.markdown(f"**Tags:** {', '.join(analysis.tags)}")

            col_a, col_b = st.columns(2)
            with col_a:
                if analysis.is_product:
                    st.success("β Product")
            with col_b:
                if analysis.is_service_related:
                    st.info("β Service-related")

            if analysis.suggested_associations:
                st.markdown(f"**Associations:** {', '.join(analysis.suggested_associations)}")

            proc_time = analysis.metadata.get('processing_time', 0)
            st.caption(f"Processed in {proc_time:.2f}s")


# Tab 4: vision analysis.
with tab4:
    _render_vision_tab()
|
|
def _render_index_tab():
    """Render the Index Tree tab: statistics, document tree and keyword search."""
    st.header("π³ PageIndex Tree Structure")

    # Return instead of st.stop(): stopping here would end the script and
    # leave the Business Profile tab empty.
    if not st.session_state.processing_complete:
        st.info("β³ Processing not complete yet.")
        return

    if not st.session_state.get('page_index_has_data') or not st.session_state.get('page_index_dict'):
        st.warning("β No index available. Run processing first.")
        return

    # The index is kept in session state as a plain dict; rebuild the model.
    from backend.models.schemas import PageIndex
    page_index = PageIndex.model_validate(st.session_state.page_index_dict)

    st.subheader("π Index Statistics")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Keywords", page_index.metadata.get('total_keywords', 0))
    with col2:
        # Use the recorded node count when a tree exists; otherwise count
        # the indexed documents.
        tree_node_count = 0
        if page_index.tree_root:
            tree_node_count = page_index.metadata.get('total_tree_nodes', 0)
        elif page_index.documents:
            tree_node_count = len(page_index.documents)
        st.metric("Tree Nodes", tree_node_count)
    with col3:
        st.metric("Build Time", f"{page_index.metadata.get('build_time_seconds', 0):.2f}s")

    st.divider()

    st.subheader("π² Document Tree")

    def display_tree_node(node, level=0):
        """Recursively display a tree node and its children."""
        indent = " " * level

        # The root is rendered in bold; every other node uses the plain form.
        if level == 0:
            st.markdown(f"{indent}**π {node.title}**")
        else:
            st.markdown(f"{indent}π {node.title}")

        if node.keywords:
            # Show at most ten keywords and summarise the remainder.
            keywords_str = ", ".join(node.keywords[:10])
            if len(node.keywords) > 10:
                keywords_str += f" ... and {len(node.keywords) - 10} more"
            st.markdown(f"{indent}**Keywords:** {keywords_str}")

        if node.start_page and node.end_page:
            st.markdown(f"{indent}**Pages:** {node.start_page}-{node.end_page}")

        if node.children:
            for child in node.children:
                display_tree_node(child, level + 1)

    if page_index.tree_root and page_index.tree_root.children:
        display_tree_node(page_index.tree_root)
    elif page_index.documents:
        # No tree available: list the indexed documents instead.
        st.info(f"π Displaying {len(page_index.documents)} documents")

        for doc_id, doc in page_index.documents.items():
            st.markdown(f"**π {os.path.basename(doc.source_file)}**")
            st.markdown(f" - **Pages:** {doc.total_pages}")
            st.markdown(f" - **Type:** {doc.file_type.value}")
            st.divider()
    else:
        st.warning("β No documents in index")

    st.subheader("π Keyword Search")

    search_query = st.text_input("Search keywords:", placeholder="e.g., burger, price, menu")

    # Index keys are looked up in lower case; strip stray whitespace so that
    # "burger " still matches "burger".
    keyword = search_query.strip().lower()

    if keyword and page_index.page_index:
        if keyword in page_index.page_index:
            refs = page_index.page_index[keyword]
            st.markdown(f"**Found '{search_query}' in {len(refs)} location(s):**")

            # Only the first five references are listed.
            for ref in refs[:5]:
                st.markdown(f"- π Document: `{ref.doc_id}`, Page {ref.page_number}")
                if ref.snippet:
                    st.markdown(f" > {ref.snippet[:200]}")
        else:
            st.info(f"Keyword '{search_query}' not found in index")

    with st.expander("π View Raw Index Data"):
        st.json({
            'total_keywords': page_index.metadata.get('total_keywords', 0),
            'total_tree_nodes': page_index.metadata.get('total_tree_nodes', 0),
            'sample_keywords': list(page_index.page_index.keys())[:50] if page_index.page_index else []
        })


# Tab 5: index tree.
with tab5:
    _render_index_tab()
|
|
| with tab6: |
| st.header("π Business Profile") |
| |
| if not st.session_state.get('business_profile'): |
| st.info("π Click 'Generate Business Profile with AI' in the Results tab to create a business profile") |
| |
| st.markdown(""" |
| ### What is a Business Profile? |
| |
| A structured digital profile containing: |
| |
| - **Business Information**: Name, description, location, contact, hours |
| - **Product Inventory**: Products with pricing, specifications, inventory |
| - **Service Inventory**: Services with pricing, itineraries, FAQs |
| - **Data Provenance**: Track where each field came from |
| |
| ### How It Works: |
| |
| 1. Upload business documents (PDFs, DOCX, images) |
| 2. Run processing pipeline (6 agents) |
| 3. Click "Generate Business Profile with AI" |
| 4. Groq AI extracts and structures the information |
| 5. View results here! |
| """) |
| else: |
| profile = st.session_state.business_profile |
| |
| |
| business_type = profile.get('business_type', 'unknown') |
| type_emoji = "πͺ" if business_type == 'product' else "πΌ" if business_type == 'service' else "π’" |
| st.markdown(f"### {type_emoji} Business Type: **{business_type.upper()}**") |
| |
| |
| profile_json = json.dumps( |
| {k: v for k, v in profile.items() if not str(k).startswith('_')}, |
| indent=2, ensure_ascii=False, default=str |
| ) |
| st.download_button( |
| label="π₯ Download Profile JSON", |
| data=profile_json, |
| file_name=f"business_profile_{st.session_state.job_id}.json", |
| mime="application/json" |
| ) |
| |
| st.divider() |
| |
| |
| st.subheader("π Business Information") |
| business_info = profile.get('business_info', {}) |
| |
| col1, col2 = st.columns(2) |
| with col1: |
| if business_info.get('name'): |
| st.markdown(f"**Name:** {business_info['name']}") |
| if business_info.get('description'): |
| st.markdown(f"**Description:** {business_info['description']}") |
| if business_info.get('category'): |
| st.markdown(f"**Category:** {business_info['category']}") |
| |
| with col2: |
| location = business_info.get('location', {}) |
| if location: |
| st.markdown("**Location:**") |
| if location.get('address'): |
| st.markdown(f" - Address: {location['address']}") |
| if location.get('city'): |
| st.markdown(f" - City: {location['city']}") |
| if location.get('state'): |
| st.markdown(f" - State: {location['state']}") |
| |
| |
| contact = business_info.get('contact', {}) |
| if contact: |
| st.markdown("**Contact:**") |
| col_a, col_b = st.columns(2) |
| with col_a: |
| if contact.get('phone'): |
| st.markdown(f" π Phone: {contact['phone']}") |
| if contact.get('email'): |
| st.markdown(f" π§ Email: {contact['email']}") |
| with col_b: |
| if contact.get('website'): |
| st.markdown(f" π Website: {contact['website']}") |
| |
| st.divider() |
| |
| |
| products = profile.get('products', []) |
| if products: |
| st.subheader(f"π¦ Products ({len(products)})") |
| for i, product in enumerate(products, 1): |
| with st.expander(f"**{i}. {product.get('name', 'Product')}**"): |
| st.write(f"**Description:** {product.get('description', 'N/A')}") |
| if product.get('pricing'): |
| pricing = product['pricing'] |
| st.write(f"**Price:** {pricing.get('base_price', 'N/A')} {pricing.get('currency', 'USD')}") |
| if product.get('specifications'): |
| st.write("**Specifications:**") |
| for key, value in product['specifications'].items(): |
| if value: |
| st.write(f" - {key}: {value}") |
| |
| st.divider() |
| |
| |
# Services section: a completeness summary followed by one expander per
# service. ``profile`` is the profile dict obtained earlier in this script.
# NOTE(review): each entry of ``services`` is assumed to be a dict - confirm
# against the producer of the profile.
services = profile.get('services', [])
if services:
    st.subheader(f"💼 Services ({len(services)})")

    # Fields that count towards the completeness score. The denominator is
    # derived from this tuple so the two can never drift apart.
    completeness_fields = (
        'name', 'description', 'category', 'pricing', 'details',
        'itinerary', 'inclusions', 'exclusions', 'cancellation_policy',
        'payment_policy', 'travel_info', 'faqs', 'tags',
    )
    total = len(completeness_fields)

    st.markdown("**Service Completeness:**")
    for i, service in enumerate(services):
        # Plain truthiness already rejects None, "", 0 and empty list/dict.
        filled = sum(1 for field in completeness_fields if service.get(field))
        pct = int(filled / total * 100)
        st.progress(
            pct / 100,
            text=f"{service.get('name', f'Service {i+1}')}: {pct}% ({filled}/{total} fields)",
        )

    st.divider()

    # One expander per service; only the first one starts expanded.
    for i, service in enumerate(services, 1):
        svc_name = service.get('name', f'Service {i}')
        with st.expander(f"🏔️ **{i}. {svc_name}**", expanded=(i == 1)):

            # --- Basic information ---
            st.markdown("#### 📋 Basic Information")
            col1, col2 = st.columns(2)
            with col1:
                st.markdown(f"**Name:** {svc_name}")
                st.markdown(f"**Category:** {service.get('category', 'N/A')}")
            with col2:
                if service.get('description'):
                    st.markdown(f"**Description:** {service['description']}")

            # --- Pricing ---
            pricing = service.get('pricing')
            if pricing and isinstance(pricing, dict):
                st.markdown("#### 💰 Pricing")
                pcol1, pcol2, pcol3 = st.columns(3)
                # Bound before the columns because two of them read it.
                curr = pricing.get('currency', 'INR')
                with pcol1:
                    bp = pricing.get('base_price')
                    st.metric("Base Price", f"{curr} {bp}" if bp else "N/A")
                with pcol2:
                    st.markdown(f"**Price Type:** {pricing.get('price_type', 'N/A')}")
                with pcol3:
                    dp = pricing.get('discount_price')
                    if dp:
                        st.metric("Discount Price", f"{curr} {dp}")

            # --- Trek details ---
            details = service.get('details')
            if details and isinstance(details, dict):
                st.markdown("#### 🏔️ Trek Details")
                dcol1, dcol2, dcol3 = st.columns(3)
                with dcol1:
                    if details.get('duration'):
                        st.markdown(f"⏱️ **Duration:** {details['duration']}")
                    if details.get('difficulty_level'):
                        diff = details['difficulty_level']
                        # str() keeps a non-string value from raising here.
                        diff_lower = str(diff).lower()
                        if 'easy' in diff_lower:
                            diff_emoji = "🟢"
                        elif 'moderate' in diff_lower:
                            diff_emoji = "🟡"
                        else:
                            diff_emoji = "🔴"
                        st.markdown(f"{diff_emoji} **Difficulty:** {diff}")
                with dcol2:
                    if details.get('max_altitude'):
                        st.markdown(f"🏔️ **Max Altitude:** {details['max_altitude']}")
                    if details.get('total_distance'):
                        st.markdown(f"📏 **Distance:** {details['total_distance']}")
                with dcol3:
                    if details.get('starting_point'):
                        st.markdown(f"📍 **Start:** {details['starting_point']}")
                    if details.get('ending_point'):
                        st.markdown(f"🏁 **End:** {details['ending_point']}")

                if details.get('group_size'):
                    st.markdown(f"👥 **Group Size:** {details['group_size']}")
                if details.get('best_time'):
                    st.markdown(f"📅 **Best Time:** {details['best_time']}")

            # --- Day-by-day itinerary ---
            itinerary = service.get('itinerary', [])
            if itinerary and isinstance(itinerary, list):
                st.markdown(f"#### 🗓️ Day-by-Day Itinerary ({len(itinerary)} days)")

                for day_data in itinerary:
                    # Entries that are not dicts are skipped without output.
                    if not isinstance(day_data, dict):
                        continue

                    day_num = day_data.get('day', '?')
                    day_title = day_data.get('title', day_data.get('description', 'N/A'))
                    day_desc = day_data.get('description', '')
                    day_alt = day_data.get('altitude', '')
                    day_dist = day_data.get('distance', '')

                    header = f"**Day {day_num}: {day_title}**"
                    if day_alt:
                        header += f" | 🏔️ {day_alt}"
                    if day_dist:
                        header += f" | 📏 {day_dist}"

                    st.markdown(header)
                    # Skip the caption when the title fell back to the description.
                    if day_desc and day_desc != day_title:
                        st.caption(day_desc)

                    # map(str, ...) below: str.join raises TypeError on
                    # non-string items.
                    activities = day_data.get('activities', [])
                    if activities and isinstance(activities, list):
                        st.markdown(" " + " → ".join(map(str, activities)))

                    meals = day_data.get('meals', [])
                    if meals and isinstance(meals, list):
                        st.markdown(f" 🍽️ Meals: {', '.join(map(str, meals))}")

                    accommodation = day_data.get('accommodation')
                    if accommodation:
                        st.markdown(f" 🏠 Stay: {accommodation}")
            else:
                st.markdown("#### 🗓️ Itinerary")
                st.caption("No itinerary data extracted")

            # --- Inclusions / exclusions ---
            incl = service.get('inclusions', [])
            excl = service.get('exclusions', [])
            if incl or excl:
                st.markdown("#### ✅ Inclusions & ❌ Exclusions")
                icol1, icol2 = st.columns(2)
                with icol1:
                    if incl and isinstance(incl, list):
                        st.markdown("**✅ Included:**")
                        for item in incl:
                            st.markdown(f" ✓ {item}")
                    else:
                        st.caption("No inclusions data")
                with icol2:
                    if excl and isinstance(excl, list):
                        st.markdown("**❌ Excluded:**")
                        for item in excl:
                            st.markdown(f" ✗ {item}")
                    else:
                        st.caption("No exclusions data")

            # --- Policies ---
            cancel_policy = service.get('cancellation_policy')
            pay_policy = service.get('payment_policy')
            if cancel_policy or pay_policy:
                st.markdown("#### 📜 Policies")
                if cancel_policy:
                    st.markdown(f"**Cancellation Policy:** {cancel_policy}")
                if pay_policy:
                    st.markdown(f"**Payment Policy:** {pay_policy}")

            # --- Travel information (shown only if any value is set) ---
            travel = service.get('travel_info')
            if travel and isinstance(travel, dict) and any(travel.values()):
                st.markdown("#### 🚗 Travel Information")
                if travel.get('how_to_reach'):
                    st.markdown(f"**How to Reach:** {travel['how_to_reach']}")
                tcol1, tcol2 = st.columns(2)
                with tcol1:
                    if travel.get('nearest_railway'):
                        st.markdown(f"🚆 **Railway:** {travel['nearest_railway']}")
                with tcol2:
                    if travel.get('nearest_airport'):
                        st.markdown(f"✈️ **Airport:** {travel['nearest_airport']}")
                landmarks = travel.get('nearby_landmarks', [])
                if landmarks and isinstance(landmarks, list):
                    st.markdown(f"📍 **Landmarks:** {', '.join(map(str, landmarks))}")

            # --- FAQs ---
            faqs = service.get('faqs', [])
            if faqs and isinstance(faqs, list):
                st.markdown(f"#### ❓ FAQs ({len(faqs)})")
                for faq in faqs:
                    if isinstance(faq, dict):
                        st.markdown(f"**Q: {faq.get('question', 'N/A')}**")
                        st.markdown(f"A: {faq.get('answer', 'N/A')}")

            # --- What to carry, split across two columns ---
            carry = service.get('what_to_carry', [])
            if carry and isinstance(carry, list):
                st.markdown("#### 🎒 What to Carry")
                ccol1, ccol2 = st.columns(2)
                # Ceiling division: the left column gets the extra item only
                # when the count is odd, so the columns stay balanced.
                half = (len(carry) + 1) // 2
                with ccol1:
                    for item in carry[:half]:
                        st.markdown(f" • {item}")
                with ccol2:
                    for item in carry[half:]:
                        st.markdown(f" • {item}")

            # --- Risk & safety ---
            risk = service.get('risk_and_safety')
            if risk:
                st.markdown("#### ⚠️ Risk & Safety")
                st.warning(risk)

            # --- Tags, each rendered as inline code ---
            tags = service.get('tags', [])
            if tags and isinstance(tags, list):
                st.markdown("#### 🏷️ Tags")
                st.markdown(" ".join(f"`{tag}`" for tag in tags))
else:
    st.info("No services extracted")
| |
st.divider()

# Extraction metadata: four headline metrics, the method/version lines, and
# the raw profile JSON for inspection.
st.subheader("📊 Extraction Metadata")
# ``or {}`` also covers a key that is present but set to None, which a
# .get() default alone would not.
metadata = profile.get('extraction_metadata') or {}
col1, col2, col3, col4 = st.columns(4)
with col1:
    # ``or 0`` keeps the float format spec from raising on a None value.
    st.metric("Processing Time", f"{(metadata.get('processing_time') or 0):.2f}s")
with col2:
    st.metric("Source Files", metadata.get('source_files_count', 0))
with col3:
    st.metric("Confidence", f"{(metadata.get('confidence_score') or 0):.0%}")
with col4:
    st.metric("LLM Calls", metadata.get('llm_calls_made', 0))

st.markdown(f"**Method:** {metadata.get('extraction_method', 'unknown')}")
st.markdown(f"**Version:** {metadata.get('version', '1.0')}")

with st.expander("📄 View Raw Profile JSON"):
    st.json(profile)
|
|
| |
# Page footer. Rendered as raw HTML (unsafe_allow_html=True), so the
# separators use the &bull; entity: it is plain ASCII and cannot be
# corrupted by a wrong file encoding.
st.divider()
st.markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
<b>Digi-Biz</b> - Agentic Business Digitization Framework<br>
Powered by Groq Vision (Llama-4-Scout) &bull; Ollama Fallback &bull; Multi-Agent Pipeline
</div>
""", unsafe_allow_html=True)
|
|