# Digi-Biz / app.py
# Deployment Bot
# Automated deployment to Hugging Face
# 255cbd1
"""
Digi-Biz: Agentic Business Digitization Framework
Streamlit Demo Application
This app demonstrates the complete workflow:
1. Upload ZIP with business documents
2. File Discovery Agent extracts and classifies files
3. Document Parsing Agent extracts text and tables
4. Media Extraction Agent extracts images
5. Vision Agent (Groq Llama-4-Scout) analyzes images
6. View results
"""
import streamlit as st
import os
import tempfile
import shutil
from pathlib import Path
from datetime import datetime
import json
import io
from PIL import Image
from backend.utils.logger import get_logger
logger = get_logger(__name__)
# Load environment variables from .env file
from dotenv import load_dotenv
env_path = Path(__file__).parent / ".env"
if env_path.exists():
load_dotenv(env_path)
# Import Groq to verify it's available
try:
from groq import Groq
GROQ_AVAILABLE = True
except ImportError:
GROQ_AVAILABLE = False
# Import agents
from backend.agents.file_discovery import FileDiscoveryAgent, FileDiscoveryInput
from backend.agents.document_parsing import DocumentParsingAgent, DocumentParsingInput
from backend.agents.table_extraction import TableExtractionAgent, TableExtractionInput
from backend.agents.media_extraction import MediaExtractionAgent, MediaExtractionInput
from backend.agents.vision_agent import VisionAgent, VisionAnalysisInput
from backend.agents.indexing import IndexingAgent, IndexingInput
from backend.utils.storage_manager import StorageManager
# =============================================================================
# Streamlit Configuration
# =============================================================================
# Page-level settings for the demo (title, icon, wide layout, open sidebar).
# The icon literal was mis-encoded in the source and is restored here.
st.set_page_config(
    page_title="Digi-Biz - Business Digitization",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS classes referenced by the raw-HTML snippets rendered further down
# (main-header, sub-header, success-box, info-box, agent-card).
st.markdown("""
<style>
.main-header {
    font-size: 2.5rem;
    font-weight: bold;
    color: #1E88E5;
    text-align: center;
    margin-bottom: 1rem;
}
.sub-header {
    font-size: 1.2rem;
    color: #666;
    text-align: center;
    margin-bottom: 2rem;
}
.success-box {
    padding: 1rem;
    border-radius: 0.5rem;
    background-color: #E8F5E9;
    border-left: 4px solid #4CAF50;
    margin: 1rem 0;
}
.info-box {
    padding: 1rem;
    border-radius: 0.5rem;
    background-color: #E3F2FD;
    border-left: 4px solid #2196F3;
    margin: 1rem 0;
}
.agent-card {
    padding: 1rem;
    border-radius: 0.5rem;
    background-color: #f5f5f5;
    margin: 0.5rem 0;
}
</style>
""", unsafe_allow_html=True)
# =============================================================================
# Session State Initialization
# =============================================================================
# Seed every pipeline slot once per session. A key that already exists (set
# during an earlier rerun) is left untouched so results survive reruns.
_SESSION_DEFAULTS = (
    ('job_id', ""),
    ('discovery_output', None),
    ('parsing_output', None),
    ('tables_output', None),
    ('media_output', None),
    ('vision_output', None),
    ('processing_complete', False),
)
for _state_key, _initial_value in _SESSION_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
# =============================================================================
# Helper Functions
# =============================================================================
def generate_job_id():
    """Generate a unique job ID.

    Returns:
        str: ``job_<YYYYMMDD>_<HHMMSS>_<8 hex chars>``. The timestamp keeps IDs
        human-readable and sortable; the random suffix prevents two uploads
        arriving within the same second from sharing one job directory.
    """
    import uuid  # local import: only this helper needs it

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    return f"job_{timestamp}_{uuid.uuid4().hex[:8]}"
def cleanup_temp_dirs():
    """Remove the ``digi_biz`` working directory under the system temp dir.

    Cleanup is best-effort: a missing directory or a file that cannot be
    deleted (e.g. still open) is ignored instead of crashing the caller.
    """
    # NOTE(review): this removes the directory for *all* jobs, not just the
    # current session's -- confirm that is intended for shared deployments.
    temp_base = Path(tempfile.gettempdir()) / "digi_biz"
    # ignore_errors also covers the "does not exist" case, so no separate
    # exists() check (and no check-then-delete race) is needed.
    shutil.rmtree(temp_base, ignore_errors=True)
def get_model_status():
    """Check whether a local Ollama server and a Qwen vision model are usable.

    Returns:
        tuple: ``(ollama_ok, qwen_available, vision_working, models)`` where
        ``models`` is the list of model names reported by the server. Any
        failure to import or reach Ollama yields ``(False, False, False, [])``.
    """
    def _model_name(entry):
        # Older clients return dicts keyed by 'name'; newer ones return objects
        # exposing 'model' -- TODO confirm against the installed ollama version.
        for field in ('name', 'model'):
            value = entry.get(field) if isinstance(entry, dict) else getattr(entry, field, None)
            if value:
                return value
        return None

    try:
        from ollama import Client

        client = Client(host='http://localhost:11434', timeout=5)
        response = client.list()
        if isinstance(response, dict):
            entries = response.get('models') or []
        else:
            entries = getattr(response, 'models', None) or []
        models = [name for name in map(_model_name, entries) if name]

        ollama_ok = True
        qwen_available = any('qwen3.5' in m for m in models)

        # Test actual vision capability with a tiny solid-colour image.
        vision_working = False
        if qwen_available:
            try:
                test_client = Client(host='http://localhost:11434', timeout=30)
                test_img = Image.new('RGB', (50, 50), color='red')
                test_bytes = io.BytesIO()
                test_img.save(test_bytes, format='PNG')
                test_response = test_client.chat(
                    model='qwen3.5:0.8b',
                    messages=[{
                        'role': 'user',
                        'content': 'What color?',
                        'images': [test_bytes.getvalue()]
                    }],
                    options={'timeout': 20000}
                )
                reply = test_response['message']['content']
                # Any non-empty reply counts: a correct answer may be as
                # short as "Red", which the old "> 10 chars" check rejected.
                vision_working = bool(reply and reply.strip())
            except Exception:
                vision_working = False
        return ollama_ok, qwen_available, vision_working, models
    except Exception:
        # Ollama is optional; treat import/connection problems as "unavailable".
        return False, False, False, []
# =============================================================================
# Main App
# =============================================================================
# Header
# Page title and tagline, styled by the custom CSS classes.
# The title emoji was mis-encoded in the source and is restored here.
st.markdown('<h1 class="main-header">📄 Digi-Biz</h1>', unsafe_allow_html=True)
st.markdown('<p class="sub-header">Agentic Business Digitization Framework</p>', unsafe_allow_html=True)
# Sidebar
# Sidebar: provider health checks, a static list of the agents, and a reset.
with st.sidebar:
    st.header("🔧 Configuration")

    # Model status
    st.subheader("Model Status")

    # Check Groq API
    groq_ok = False
    groq_model = "N/A"
    groq_error = ""
    api_key = os.getenv("GROQ_API_KEY")
    if not GROQ_AVAILABLE:
        # The import guard at the top of the file already told us the package
        # is missing; touching `Groq` here would only raise NameError.
        groq_error = "groq package not installed"
    elif not api_key:
        groq_error = "GROQ_API_KEY not set in .env"
    elif api_key == "gsk_YOUR_API_KEY_HERE":
        groq_error = "Using placeholder key"
    else:
        try:
            # Listing models is used purely as a connectivity/credential probe.
            Groq(api_key=api_key, timeout=5).models.list()
            groq_ok = True
            groq_model = "llama-4-scout-17b"
        except Exception as e:
            groq_error = str(e)[:100]

    if groq_ok:
        st.success(f"✓ Groq API: {groq_model}")
    else:
        st.error("✗ Groq API Not Available")
        st.code(groq_error, language=None)
        st.info("Fix: Get key from https://console.groq.com and add to .env file")

    # Check Ollama (fallback)
    ollama_ok = False
    try:
        from ollama import Client
        Client(host='http://localhost:11434', timeout=5).list()
        ollama_ok = True
    except Exception:
        # Ollama is optional; any import/connection failure means "not running".
        pass

    if ollama_ok:
        st.success("✓ Ollama: Fallback Ready")
    else:
        st.warning("⚠ Ollama: Not Running (optional)")

    st.divider()

    # Agent status
    st.subheader("Agents")
    agent_cards = [
        ("1. File Discovery", "Extracts & classifies files from ZIP"),
        ("2. Document Parsing", "Extracts text from PDF/DOCX"),
        ("3. Table Extraction", "Detects & classifies tables"),
        ("4. Media Extraction", "Extracts embedded images"),
        ("5. Vision Agent", "Analyzes images with Groq"),
        ("6. Indexing Agent", "Builds RAG search index"),
    ]
    st.markdown(
        "\n".join(
            f'<div class="agent-card"><b>{title}</b><br><small>{blurb}</small></div>'
            for title, blurb in agent_cards
        ),
        unsafe_allow_html=True,
    )

    st.divider()

    # Reset button
    if st.button("🔄 Reset All", use_container_width=True):
        cleanup_temp_dirs()
        # Remove the keys instead of setting them to None: the defaults
        # ("" for job_id, False for flags) are then re-applied on the rerun.
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.rerun()
# Main content area
# Main content area: one tab per stage of the workflow.
# Tab-label emoji were mis-encoded in the source and are restored here.
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
    "📤 Upload",
    "⚙️ Processing",
    "📊 Results",
    "🖼️ Vision Analysis",
    "🌳 Index Tree",
    "📄 Business Profile",
])
# Tab 1: upload a ZIP, persist it to a per-job temp directory, start the run.
with tab1:
    st.header("Upload Business Documents")
    st.markdown("\n".join([
        "**Supported Formats:**",
        "",
        "- 📄 Documents: PDF, DOCX, DOC",
        "- 📊 Spreadsheets: XLSX, XLS, CSV",
        "- 🖼️ Images: JPG, PNG, GIF, WEBP",
        "- 🎥 Videos: MP4, AVI, MOV",
        "",
        "**Instructions:**",
        "",
        "1. Create a ZIP file with your business documents",
        "2. Upload using the file uploader below",
        "3. Click \"Start Processing\"",
    ]))

    uploaded_file = st.file_uploader(
        "Upload ZIP file",
        type=['zip'],
        help="Select a ZIP file containing business documents"
    )

    if uploaded_file:
        st.success(f"✓ Uploaded: {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")

        # Streamlit reruns this script on every interaction. Only write the
        # ZIP again when a different file is uploaded (or the saved copy has
        # gone), otherwise every rerun would create another temp directory.
        # `file_id` is used when the uploader object provides it.
        upload_key = getattr(uploaded_file, "file_id", None) or f"{uploaded_file.name}:{uploaded_file.size}"
        saved_path = st.session_state.get("zip_path")
        needs_save = (
            st.session_state.get("uploaded_file_key") != upload_key
            or not saved_path
            or not Path(saved_path).exists()
        )
        if needs_save:
            # Save to temp location
            temp_dir = Path(tempfile.gettempdir()) / "digi_biz" / generate_job_id()
            temp_dir.mkdir(parents=True, exist_ok=True)
            # Keep only the final path component of the client-supplied name
            # so it cannot escape the job directory.
            safe_name = Path(uploaded_file.name).name or "upload.zip"
            zip_path = temp_dir / safe_name
            with open(zip_path, 'wb') as f:
                f.write(uploaded_file.getvalue())
            st.session_state.zip_path = str(zip_path)
            st.session_state.job_id = temp_dir.name
            st.session_state.uploaded_file_key = upload_key
            # A new upload invalidates any earlier run.
            st.session_state.processing_started = False
            st.session_state.processing_complete = False

        st.info(f"Job ID: `{st.session_state.job_id}`")

        # Start processing button
        if st.button("🚀 Start Processing", type="primary", use_container_width=True):
            st.session_state.processing_started = True
            st.rerun()
def _run_processing_pipeline():
    """Run the six agents in order, rendering progress into the current tab.

    Each agent's output is stored in ``st.session_state``. File discovery is
    mandatory; every later step is best-effort and only emits a warning on
    failure so the remaining steps still run.

    Returns:
        bool: True when the pipeline reached the end, False when file
        discovery failed and nothing else could be processed.
    """
    state = st.session_state
    job_id = state.job_id
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: File Discovery
    status_text.text("Step 1/6: File Discovery Agent...")
    try:
        storage_manager = StorageManager(
            storage_base=str(Path(tempfile.gettempdir()) / "digi_biz" / job_id)
        )
        discovery_agent = FileDiscoveryAgent(storage_manager=storage_manager)
        discovery = discovery_agent.discover(
            FileDiscoveryInput(zip_file_path=state.zip_path, job_id=job_id)
        )
        state.discovery_output = discovery
        progress_bar.progress(20)
        if not discovery.success:
            st.error(f"✗ File Discovery Failed: {discovery.errors}")
            return False
        st.success(f"✓ File Discovery Complete: {discovery.total_files} files")
        summary = discovery.summary
        st.markdown(
            '<div class="success-box">'
            '<b>Summary:</b><br>'
            f'• Documents: {summary.get("documents_count", 0)}<br>'
            f'• Spreadsheets: {summary.get("spreadsheets_count", 0)}<br>'
            f'• Images: {summary.get("images_count", 0)}<br>'
            f'• Videos: {summary.get("videos_count", 0)}'
            '</div>',
            unsafe_allow_html=True,
        )
    except Exception as e:
        st.error(f"File Discovery Error: {str(e)}")
        return False

    # Step 2: Document Parsing
    status_text.text("Step 2/6: Document Parsing Agent...")
    try:
        parsing_agent = DocumentParsingAgent(enable_ocr=False)
        state.parsing_output = parsing_agent.parse(DocumentParsingInput(
            documents=state.discovery_output.documents,
            job_id=job_id,
            enable_ocr=False
        ))
        progress_bar.progress(40)
        if state.parsing_output.success:
            st.success(f"✓ Document Parsing Complete: {state.parsing_output.total_pages} pages")
        else:
            st.warning("⚠ Document Parsing: No documents to parse")
    except Exception as e:
        st.warning(f"Document Parsing: {str(e)}")

    # Later steps all consume the parsed documents (empty if parsing failed).
    parsed_documents = state.parsing_output.parsed_documents if state.parsing_output else []

    # Step 3: Table Extraction
    status_text.text("Step 3/6: Table Extraction Agent...")
    try:
        table_agent = TableExtractionAgent()
        state.tables_output = table_agent.extract(TableExtractionInput(
            parsed_documents=parsed_documents,
            job_id=job_id
        ))
        progress_bar.progress(60)
        if state.tables_output.success:
            st.success(f"✓ Table Extraction Complete: {state.tables_output.total_tables} tables")
            if state.tables_output.tables_by_type:
                types_str = ", ".join(f"{k}: {v}" for k, v in state.tables_output.tables_by_type.items())
                st.info(f"Types: {types_str}")
        else:
            st.warning("⚠ Table Extraction: No tables found")
    except Exception as e:
        st.warning(f"Table Extraction: {str(e)}")

    # Step 4: Media Extraction
    status_text.text("Step 4/6: Media Extraction Agent...")
    try:
        media_agent = MediaExtractionAgent(enable_deduplication=True)
        state.media_output = media_agent.extract_all(MediaExtractionInput(
            parsed_documents=parsed_documents,
            standalone_files=[img.file_path for img in state.discovery_output.images] if state.discovery_output else [],
            job_id=job_id
        ))
        progress_bar.progress(80)
        if state.media_output.success:
            st.success(f"✓ Media Extraction Complete: {state.media_output.total_images} images")
            if state.media_output.duplicates_removed > 0:
                st.info(f"Removed {state.media_output.duplicates_removed} duplicates")
        else:
            st.warning("⚠ Media Extraction: No images found")
    except Exception as e:
        st.warning(f"Media Extraction: {str(e)}")

    media_ok = bool(state.media_output and state.media_output.success)
    all_images = state.media_output.media.images if media_ok else []

    # Step 5: Vision Analysis (Groq first, Ollama as fallback)
    status_text.text("Step 5/6: Vision Agent (Groq Llama-4-Scout)...")
    images_to_analyze = all_images[:5]  # Analyze first 5 images
    if images_to_analyze:
        st.info(f"Analyzing {len(images_to_analyze)} images with Groq Vision (Llama-4-Scout)...")
        progress_vision = st.progress(0)
        try:
            # Building the agent inside the try lets a construction failure
            # (e.g. missing API key) fall through to the Ollama fallback too.
            analyses = VisionAgent(provider="groq", timeout=120).analyze_batch(images_to_analyze)
            state.vision_output = analyses
            progress_vision.progress(100)
            st.success(f"✓ Vision Analysis Complete: {len(analyses)} images analyzed")
            # Show quick summary
            if analyses:
                categories = {}
                for a in analyses:
                    # category may be an enum or a plain string
                    cat = getattr(a.category, 'value', a.category)
                    categories[cat] = categories.get(cat, 0) + 1
                st.markdown("**Categories Detected:**")
                st.info(", ".join(f"{k}: {v}" for k, v in categories.items()))
        except Exception as ve:
            st.warning(f"Vision analysis failed: {str(ve)}")
            st.info("Falling back to Ollama...")
            try:
                analyses = VisionAgent(provider="ollama", timeout=120).analyze_batch(images_to_analyze)
                state.vision_output = analyses
                st.success(f"✓ Vision Analysis Complete (via Ollama): {len(analyses)} images")
            except Exception as e2:
                state.vision_output = None
                st.error(f"All vision providers failed: {e2}")
    else:
        state.vision_output = None
        st.warning("⚠ Vision Analysis: No images to analyze")
    progress_bar.progress(90)

    # Step 6: Indexing (RAG)
    status_text.text("Step 6/6: Building Search Index (RAG)...")
    try:
        indexing_agent = IndexingAgent()
        page_index = indexing_agent.build_index(IndexingInput(
            parsed_documents=parsed_documents,
            tables=state.tables_output.tables if state.tables_output else [],
            images=all_images,
            job_id=job_id
        ))
        # Store in session state (convert Pydantic model to dict for serialization)
        state.page_index_dict = page_index.model_dump(mode='json')
        state.page_index_has_data = True
        st.success(f"✓ Index Built: {page_index.metadata.get('total_keywords', 0)} keywords")
    except Exception as e:
        st.warning(f"Indexing failed: {str(e)}")
        state.page_index_dict = None
        state.page_index_has_data = False

    progress_bar.progress(100)
    status_text.text("✓ Processing Complete!")
    return True


# Tab 2: run the pipeline. No st.stop() here: it would halt the whole script
# and leave every later tab empty.
with tab2:
    st.header("Processing Pipeline")
    _current_zip = st.session_state.get('zip_path')
    if not st.session_state.get('processing_started'):
        st.info("👆 Upload a ZIP file and click 'Start Processing'")
    elif (
        st.session_state.get('processing_complete')
        and st.session_state.get('processed_zip_path') == _current_zip
    ):
        # Streamlit reruns the script on every interaction; do not repeat the
        # (slow, API-backed) pipeline for an upload that is already processed.
        st.success("✓ Processing Complete! See the other tabs for results.")
    else:
        try:
            st.session_state.processing_complete = _run_processing_pipeline()
        except Exception as e:
            st.warning(f"Processing error: {str(e)}")
            st.session_state.processing_complete = False
        if st.session_state.processing_complete:
            st.session_state.processed_zip_path = _current_zip
# Step 7: Schema Mapping (optional - for future)
# TODO: Add schema mapping button in Results tab
# Tab 3: profile generation plus a summary of each agent's output.
# Uses if/else rather than st.stop() so the later tabs still render.
with tab3:
    st.header("Processing Results")
    if not st.session_state.get('processing_complete'):
        st.info("⏳ Processing not complete yet. Go to 'Processing' tab.")
    else:
        # Generate Business Profile Button
        st.subheader("🎯 Generate Business Profile")
        st.markdown("Use AI to create a structured business profile from extracted data")
        if st.button("🚀 Generate Business Profile with AI", type="primary", use_container_width=True):
            with st.spinner("Generating business profile with Groq AI... Processing each document individually (1-2 minutes)"):
                try:
                    from backend.agents.schema_mapping_simple import SchemaMappingAgent
                    from backend.models.schemas import SchemaMappingInput
                    from backend.agents.validation_agent import ValidationAgent
                    from backend.models.schemas import ValidationInput as ValidationInputSchema

                    # Get page index
                    if not st.session_state.get('page_index_dict'):
                        st.error("No index available. Please run processing first.")
                    else:
                        from backend.models.schemas import PageIndex
                        page_index = PageIndex.model_validate(st.session_state.page_index_dict)

                        # Step 1: Schema Mapping
                        with st.status("Running Schema Mapping Agent...", expanded=True) as status:
                            agent = SchemaMappingAgent()
                            input_data = SchemaMappingInput(
                                page_index=page_index,
                                job_id=st.session_state.job_id
                            )
                            mapping_output = agent.map_to_schema(input_data)
                            if mapping_output.success and mapping_output.profile:
                                st.success("✅ Schema mapping complete!")
                                status.update(label="Schema Mapping Complete", state="complete")
                            else:
                                st.warning(f"⚠️ Schema mapping had issues: {mapping_output.errors}")
                                status.update(label="Schema Mapping Complete (with warnings)", state="complete")

                        if mapping_output.profile:
                            # Step 2: Validation -- only meaningful when a
                            # profile was actually produced.
                            with st.status("Running Validation Agent...", expanded=True) as status:
                                validation_agent = ValidationAgent()
                                validation_input = ValidationInputSchema(
                                    profile=mapping_output.profile,
                                    job_id=st.session_state.job_id
                                )
                                validation_output = validation_agent.validate(validation_input)
                                st.session_state.validation_result = validation_output.model_dump(mode='json')
                                if validation_output.is_valid:
                                    st.success(f"✅ Validation passed! Completeness: {validation_output.completeness_score:.0%}")
                                    status.update(label="Validation Complete", state="complete")
                                else:
                                    st.warning(f"⚠️ Validation found {len(validation_output.errors)} errors")
                                    status.update(label="Validation Complete (errors found)", state="complete")

                            # Store profile
                            st.session_state.business_profile = mapping_output.profile.model_dump(mode='json')
                            st.success("✅ Business Profile Generated Successfully!")
                            st.info("Go to 'Business Profile' tab to view results")
                        else:
                            st.error("Failed to generate profile")
                except Exception as e:
                    st.error(f"Error generating profile: {str(e)}")
                    # exception() records the traceback alongside the message
                    logger.exception(f"Schema mapping failed: {e}")

        st.divider()

        # File Discovery Results
        st.subheader("📁 File Discovery")
        discovery = st.session_state.discovery_output
        if discovery:
            count_labels = [
                ("Documents", 'documents_count'),
                ("Spreadsheets", 'spreadsheets_count'),
                ("Images", 'images_count'),
                ("Videos", 'videos_count'),
            ]
            for column, (label, summary_key) in zip(st.columns(4), count_labels):
                with column:
                    st.metric(label, discovery.summary.get(summary_key, 0))
            # File list
            with st.expander("📋 View File List"):
                if discovery.documents:
                    st.write("**Documents:**")
                    for doc in discovery.documents:
                        st.write(f"- {doc.original_name} ({doc.file_type.value})")

        # Document Parsing Results
        st.subheader("📄 Document Parsing")
        parsing = st.session_state.parsing_output
        if parsing and parsing.success:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Pages", parsing.total_pages)
            with col2:
                st.metric("Processing Time", f"{parsing.processing_time:.1f}s")
            # Show extracted text from first document
            with st.expander("📝 View Extracted Text"):
                if parsing.parsed_documents:
                    doc = parsing.parsed_documents[0]
                    st.write(f"**Source:** {doc.source_file}")
                    st.write(f"**Pages:** {doc.total_pages}")
                    if doc.pages and doc.pages[0].text:
                        # Preview is capped at the first 1000 characters.
                        st.text_area("Text content", doc.pages[0].text[:1000], height=300)

        # Table Extraction Results
        st.subheader("📊 Table Extraction")
        tables = st.session_state.tables_output
        if tables and tables.success:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Tables Found", tables.total_tables)
            with col2:
                st.metric("By Type", str(tables.tables_by_type))
            # Show tables
            with st.expander("📋 View Tables"):
                for i, table in enumerate(tables.tables, 1):
                    st.write(f"**Table {i}:** {table.table_type.value}")
                    st.write(f"Source: {table.source_doc}, Page: {table.source_page}")
                    if table.headers:
                        st.write(f"Headers: {', '.join(table.headers)}")
# Tab 4: per-image vision results. Branches instead of st.stop(), so a run
# without images no longer blanks the Index Tree and Business Profile tabs.
with tab4:
    st.header("🖼️ Vision Analysis (Groq Llama-4-Scout)")
    if not st.session_state.get('processing_complete'):
        st.info("⏳ Processing not complete yet.")
    elif not st.session_state.vision_output:
        st.warning("⚠ No vision analysis available. Either no images were found or analysis failed.")
    else:
        # Marker shown next to each known category; unknown ones get the default.
        category_colors = {
            'product': '🔵',
            'service': '🟢',
            'food': '🟠',
            'destination': '🟣',
            'person': '🔴',
            'document': '⚪',
            'logo': '🟡',
            'other': '⚫'
        }
        media_output = st.session_state.media_output
        candidate_images = media_output.media.images if media_output else []

        # Show analyzed images
        for i, analysis in enumerate(st.session_state.vision_output, 1):
            st.divider()
            col1, col2 = st.columns([1, 2])
            with col1:
                # Find corresponding image (first match by image_id)
                matched = next(
                    (img for img in candidate_images if img.image_id == analysis.image_id),
                    None,
                )
                if matched is not None:
                    try:
                        st.image(matched.file_path, caption=analysis.image_id, use_container_width=True)
                    except Exception:
                        # Image could not be displayed; fall back to its ID.
                        st.write(f"Image: {analysis.image_id}")
            with col2:
                st.subheader(f"Analysis {i}")

                # Category badge - handle both str and enum
                category_value = analysis.category
                if hasattr(analysis.category, 'value'):
                    category_value = analysis.category.value
                elif isinstance(analysis.category, str):
                    category_value = analysis.category.lower()
                category_emoji = category_colors.get(category_value, '⚪')
                st.markdown(f"**Category:** {category_emoji} {category_value}")

                # Show provider and confidence
                provider = str(analysis.metadata.get('provider') or 'unknown')
                provider_icon = "🚀" if provider == 'groq' else "🦙"
                st.markdown(f"**Provider:** {provider_icon} {provider.upper()}")
                st.markdown(f"**Confidence:** {analysis.confidence:.0%}")

                # Description
                if analysis.description:
                    st.markdown(f"**Description:** {analysis.description}")
                # Tags
                if analysis.tags:
                    st.markdown(f"**Tags:** {', '.join(analysis.tags)}")

                # Product/Service flags
                col_a, col_b = st.columns(2)
                with col_a:
                    if analysis.is_product:
                        st.success("✓ Product")
                with col_b:
                    if analysis.is_service_related:
                        st.info("✓ Service-related")

                # Associations
                if analysis.suggested_associations:
                    st.markdown(f"**Associations:** {', '.join(analysis.suggested_associations)}")

                # Processing time
                proc_time = analysis.metadata.get('processing_time', 0)
                st.caption(f"Processed in {proc_time:.2f}s")
# Tab 5: index statistics, document tree and keyword lookup. Branches
# instead of st.stop() so the Business Profile tab is still rendered.
with tab5:
    st.header("🌳 PageIndex Tree Structure")
    if not st.session_state.get('processing_complete'):
        st.info("⏳ Processing not complete yet.")
    elif not st.session_state.get('page_index_has_data') or not st.session_state.get('page_index_dict'):
        st.warning("⚠ No index available. Run processing first.")
    else:
        # Reconstruct PageIndex from dict
        from backend.models.schemas import PageIndex
        page_index = PageIndex.model_validate(st.session_state.page_index_dict)

        # Index Statistics
        st.subheader("📊 Index Statistics")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Keywords", page_index.metadata.get('total_keywords', 0))
        with col2:
            # Count tree nodes from documents if tree_root is None
            tree_node_count = 0
            if page_index.tree_root:
                tree_node_count = page_index.metadata.get('total_tree_nodes', 0)
            elif page_index.documents:
                tree_node_count = len(page_index.documents)
            st.metric("Tree Nodes", tree_node_count)
        with col3:
            st.metric("Build Time", f"{page_index.metadata.get('build_time_seconds', 0):.2f}s")

        st.divider()

        # Tree Visualization - Show documents if tree_root is None
        st.subheader("🌲 Document Tree")
        if page_index.tree_root and page_index.tree_root.children:
            def display_tree_node(node, level=0):
                """Recursively render *node* and its children, one line each."""
                indent = " " * level
                # Root is shown as a folder, every deeper node as a page.
                if level == 0:
                    st.markdown(f"{indent}**📁 {node.title}**")
                else:
                    st.markdown(f"{indent}📄 {node.title}")
                # Show details
                if node.keywords:
                    keywords_str = ", ".join(node.keywords[:10])  # Show first 10
                    if len(node.keywords) > 10:
                        keywords_str += f" ... and {len(node.keywords) - 10} more"
                    st.markdown(f"{indent}**Keywords:** {keywords_str}")
                if node.start_page and node.end_page:
                    st.markdown(f"{indent}**Pages:** {node.start_page}-{node.end_page}")
                # Display children
                for child in node.children or []:
                    display_tree_node(child, level + 1)

            display_tree_node(page_index.tree_root)
        elif page_index.documents:
            # Fallback: Display documents directly
            st.info(f"📄 Displaying {len(page_index.documents)} documents")
            for doc in page_index.documents.values():
                st.markdown(f"**📄 {os.path.basename(doc.source_file)}**")
                st.markdown(f" - **Pages:** {doc.total_pages}")
                st.markdown(f" - **Type:** {doc.file_type.value}")
                st.divider()
        else:
            st.warning("⚠ No documents in index")

        # Keyword Search
        st.subheader("🔍 Keyword Search")
        search_query = st.text_input("Search keywords:", placeholder="e.g., burger, price, menu")
        # Surrounding whitespace is stripped so " menu " still matches "menu".
        keyword = search_query.strip().lower() if search_query else ""
        if keyword and page_index.page_index:
            refs = page_index.page_index.get(keyword)
            if refs is not None:
                st.markdown(f"**Found '{search_query}' in {len(refs)} location(s):**")
                for ref in refs[:5]:  # Show first 5
                    st.markdown(f"- 📄 Document: `{ref.doc_id}`, Page {ref.page_number}")
                    if ref.snippet:
                        st.markdown(f" > {ref.snippet[:200]}")
            else:
                st.info(f"Keyword '{search_query}' not found in index")

        # Raw Index Data (collapsible)
        with st.expander("📋 View Raw Index Data"):
            st.json({
                'total_keywords': page_index.metadata.get('total_keywords', 0),
                'total_tree_nodes': page_index.metadata.get('total_tree_nodes', 0),
                'sample_keywords': list(page_index.page_index.keys())[:50] if page_index.page_index else []
            })
with tab6:
st.header("πŸ“„ Business Profile")
if not st.session_state.get('business_profile'):
st.info("πŸ‘† Click 'Generate Business Profile with AI' in the Results tab to create a business profile")
st.markdown("""
### What is a Business Profile?
A structured digital profile containing:
- **Business Information**: Name, description, location, contact, hours
- **Product Inventory**: Products with pricing, specifications, inventory
- **Service Inventory**: Services with pricing, itineraries, FAQs
- **Data Provenance**: Track where each field came from
### How It Works:
1. Upload business documents (PDFs, DOCX, images)
2. Run processing pipeline (6 agents)
3. Click "Generate Business Profile with AI"
4. Groq AI extracts and structures the information
5. View results here!
""")
else:
profile = st.session_state.business_profile
# Business Type Badge
business_type = profile.get('business_type', 'unknown')
type_emoji = "πŸͺ" if business_type == 'product' else "πŸ’Ό" if business_type == 'service' else "🏒"
st.markdown(f"### {type_emoji} Business Type: **{business_type.upper()}**")
# Download JSON button
profile_json = json.dumps(
{k: v for k, v in profile.items() if not str(k).startswith('_')},
indent=2, ensure_ascii=False, default=str
)
st.download_button(
label="πŸ“₯ Download Profile JSON",
data=profile_json,
file_name=f"business_profile_{st.session_state.job_id}.json",
mime="application/json"
)
st.divider()
# Business Info
st.subheader("πŸ“Š Business Information")
business_info = profile.get('business_info', {})
col1, col2 = st.columns(2)
with col1:
if business_info.get('name'):
st.markdown(f"**Name:** {business_info['name']}")
if business_info.get('description'):
st.markdown(f"**Description:** {business_info['description']}")
if business_info.get('category'):
st.markdown(f"**Category:** {business_info['category']}")
with col2:
location = business_info.get('location', {})
if location:
st.markdown("**Location:**")
if location.get('address'):
st.markdown(f" - Address: {location['address']}")
if location.get('city'):
st.markdown(f" - City: {location['city']}")
if location.get('state'):
st.markdown(f" - State: {location['state']}")
# Contact Info
contact = business_info.get('contact', {})
if contact:
st.markdown("**Contact:**")
col_a, col_b = st.columns(2)
with col_a:
if contact.get('phone'):
st.markdown(f" πŸ“ž Phone: {contact['phone']}")
if contact.get('email'):
st.markdown(f" πŸ“§ Email: {contact['email']}")
with col_b:
if contact.get('website'):
st.markdown(f" 🌐 Website: {contact['website']}")
st.divider()
# Products
products = profile.get('products', [])
if products:
st.subheader(f"πŸ“¦ Products ({len(products)})")
for i, product in enumerate(products, 1):
with st.expander(f"**{i}. {product.get('name', 'Product')}**"):
st.write(f"**Description:** {product.get('description', 'N/A')}")
if product.get('pricing'):
pricing = product['pricing']
st.write(f"**Price:** {pricing.get('base_price', 'N/A')} {pricing.get('currency', 'USD')}")
if product.get('specifications'):
st.write("**Specifications:**")
for key, value in product['specifications'].items():
if value:
st.write(f" - {key}: {value}")
st.divider()
# ============== SERVICES (COMPREHENSIVE DISPLAY) ==============
services = profile.get('services', [])
if services:
st.subheader(f"πŸ’Ό Services ({len(services)})")
# Service completeness overview
st.markdown("**Service Completeness:**")
for i, service in enumerate(services):
filled = 0
total = 13
for field in ['name', 'description', 'category', 'pricing', 'details',
'itinerary', 'inclusions', 'exclusions', 'cancellation_policy',
'payment_policy', 'travel_info', 'faqs', 'tags']:
val = service.get(field)
if val and (not isinstance(val, (list, dict)) or len(val) > 0):
filled += 1
pct = int(filled / total * 100)
st.progress(pct / 100, text=f"{service.get('name', f'Service {i+1}')}: {pct}% ({filled}/{total} fields)")
st.divider()
# Render each service in detail
# Render one expander per extracted service; only the first one starts open.
for i, service in enumerate(services, 1):
    # The nested sections below type-check their values before use; apply the
    # same guard to the service entry itself so one malformed entry does not
    # abort rendering of the whole list.
    if not isinstance(service, dict):
        continue
    # `or` (instead of a .get default) also covers a name that is present
    # but null/empty, which would otherwise render as "None".
    svc_name = service.get('name') or f'Service {i}'
    with st.expander(f"πŸ”οΈ **{i}. {svc_name}**", expanded=(i == 1)):
        # --- Basic Info ---
        st.markdown("#### πŸ“‹ Basic Information")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown(f"**Name:** {svc_name}")
            st.markdown(f"**Category:** {service.get('category', 'N/A')}")
        with col2:
            if service.get('description'):
                st.markdown(f"**Description:** {service['description']}")

        # --- Pricing ---
        pricing = service.get('pricing')
        if pricing and isinstance(pricing, dict):
            st.markdown("#### πŸ’° Pricing")
            bp = pricing.get('base_price')
            dp = pricing.get('discount_price')
            # Fall back to INR when the currency is missing *or* null.
            curr = pricing.get('currency') or 'INR'
            pcol1, pcol2, pcol3 = st.columns(3)
            with pcol1:
                st.metric("Base Price", f"{curr} {bp}" if bp else "N/A")
            with pcol2:
                st.markdown(f"**Price Type:** {pricing.get('price_type', 'N/A')}")
            with pcol3:
                if dp:
                    st.metric("Discount Price", f"{curr} {dp}")

        # --- Trek Details ---
        details = service.get('details')
        if details and isinstance(details, dict):
            st.markdown("#### πŸ”οΈ Trek Details")
            dcol1, dcol2, dcol3 = st.columns(3)
            with dcol1:
                if details.get('duration'):
                    st.markdown(f"⏱️ **Duration:** {details['duration']}")
                if details.get('difficulty_level'):
                    diff = details['difficulty_level']
                    # str() keeps a non-string difficulty (e.g. a number)
                    # from crashing on .lower().
                    diff_lower = str(diff).lower()
                    if 'easy' in diff_lower:
                        diff_emoji = "🟒"
                    elif 'moderate' in diff_lower:
                        diff_emoji = "🟑"
                    else:
                        diff_emoji = "πŸ”΄"
                    st.markdown(f"{diff_emoji} **Difficulty:** {diff}")
            with dcol2:
                if details.get('max_altitude'):
                    st.markdown(f"πŸ”οΈ **Max Altitude:** {details['max_altitude']}")
                if details.get('total_distance'):
                    st.markdown(f"πŸ“ **Distance:** {details['total_distance']}")
            with dcol3:
                if details.get('starting_point'):
                    st.markdown(f"πŸ“ **Start:** {details['starting_point']}")
                if details.get('ending_point'):
                    st.markdown(f"πŸ“ **End:** {details['ending_point']}")
            # NOTE(review): group size / best time are rendered full-width
            # below the three columns -- confirm this is the intended layout.
            if details.get('group_size'):
                st.markdown(f"πŸ‘₯ **Group Size:** {details['group_size']}")
            if details.get('best_time'):
                st.markdown(f"πŸ“… **Best Time:** {details['best_time']}")

        # --- Itinerary ---
        itinerary = service.get('itinerary', [])
        if itinerary and isinstance(itinerary, list):
            st.markdown(f"#### πŸ—“οΈ Day-by-Day Itinerary ({len(itinerary)} days)")
            for day_data in itinerary:
                if isinstance(day_data, dict):
                    day_num = day_data.get('day', '?')
                    # The description doubles as the title when no title exists.
                    day_title = day_data.get('title', day_data.get('description', 'N/A'))
                    day_desc = day_data.get('description', '')
                    day_alt = day_data.get('altitude', '')
                    day_dist = day_data.get('distance', '')
                    header = f"**Day {day_num}: {day_title}**"
                    if day_alt:
                        header += f" | πŸ”οΈ {day_alt}"
                    if day_dist:
                        header += f" | πŸ“ {day_dist}"
                    st.markdown(header)
                    # Avoid printing the description twice when it was used
                    # as the title fallback.
                    if day_desc and day_desc != day_title:
                        st.caption(day_desc)
                    # Show activities if present; map(str, ...) tolerates
                    # non-string list items, which str.join would reject.
                    activities = day_data.get('activities', [])
                    if activities and isinstance(activities, list):
                        st.markdown(" " + " β†’ ".join(map(str, activities)))
                    # Show meals if present
                    meals = day_data.get('meals', [])
                    if meals and isinstance(meals, list):
                        meals_text = ', '.join(map(str, meals))
                        st.markdown(f" 🍽️ Meals: {meals_text}")
                    # Show accommodation if present
                    accommodation = day_data.get('accommodation')
                    if accommodation:
                        st.markdown(f" 🏠 Stay: {accommodation}")
        else:
            st.markdown("#### πŸ—“οΈ Itinerary")
            st.caption("No itinerary data extracted")

        # --- Inclusions & Exclusions ---
        incl = service.get('inclusions', [])
        excl = service.get('exclusions', [])
        if incl or excl:
            st.markdown("#### βœ… Inclusions & ❌ Exclusions")
            icol1, icol2 = st.columns(2)
            with icol1:
                if incl and isinstance(incl, list):
                    st.markdown("**βœ… Included:**")
                    for item in incl:
                        st.markdown(f" βœ“ {item}")
                else:
                    st.caption("No inclusions data")
            with icol2:
                if excl and isinstance(excl, list):
                    st.markdown("**❌ Excluded:**")
                    for item in excl:
                        st.markdown(f" βœ— {item}")
                else:
                    st.caption("No exclusions data")

        # --- Policies ---
        cancel_policy = service.get('cancellation_policy')
        pay_policy = service.get('payment_policy')
        if cancel_policy or pay_policy:
            st.markdown("#### πŸ“œ Policies")
            if cancel_policy:
                st.markdown(f"**Cancellation Policy:** {cancel_policy}")
            if pay_policy:
                st.markdown(f"**Payment Policy:** {pay_policy}")

        # --- Travel Info ---
        travel = service.get('travel_info')
        # any(...) hides the section when every travel field is empty.
        if travel and isinstance(travel, dict) and any(travel.values()):
            st.markdown("#### πŸš‚ Travel Information")
            if travel.get('how_to_reach'):
                st.markdown(f"**How to Reach:** {travel['how_to_reach']}")
            tcol1, tcol2 = st.columns(2)
            with tcol1:
                if travel.get('nearest_railway'):
                    st.markdown(f"πŸš† **Railway:** {travel['nearest_railway']}")
            with tcol2:
                if travel.get('nearest_airport'):
                    st.markdown(f"✈️ **Airport:** {travel['nearest_airport']}")
            # NOTE(review): landmarks are rendered below the two columns --
            # confirm this is the intended layout.
            landmarks = travel.get('nearby_landmarks', [])
            if landmarks and isinstance(landmarks, list):
                landmarks_text = ', '.join(map(str, landmarks))
                st.markdown(f"πŸ“ **Landmarks:** {landmarks_text}")

        # --- FAQs ---
        faqs = service.get('faqs', [])
        if faqs and isinstance(faqs, list):
            st.markdown(f"#### ❓ FAQs ({len(faqs)})")
            for faq in faqs:
                if isinstance(faq, dict):
                    st.markdown(f"**Q: {faq.get('question', 'N/A')}**")
                    st.markdown(f"A: {faq.get('answer', 'N/A')}")

        # --- What to Carry ---
        carry = service.get('what_to_carry', [])
        if carry and isinstance(carry, list):
            st.markdown("#### πŸŽ’ What to Carry")
            ccol1, ccol2 = st.columns(2)
            # Ceiling division keeps the two columns balanced; the left
            # column takes the extra item when the count is odd.
            half = (len(carry) + 1) // 2
            with ccol1:
                for item in carry[:half]:
                    st.markdown(f" β€’ {item}")
            with ccol2:
                for item in carry[half:]:
                    st.markdown(f" β€’ {item}")

        # --- Risk & Safety ---
        risk = service.get('risk_and_safety')
        if risk:
            st.markdown("#### ⚠️ Risk & Safety")
            st.warning(risk)

        # --- Tags ---
        tags = service.get('tags', [])
        if tags and isinstance(tags, list):
            st.markdown("#### 🏷️ Tags")
            st.markdown(" ".join([f"`{tag}`" for tag in tags]))
else:
st.info("No services extracted")
st.divider()
# Metadata: summary metrics about the extraction run, read from the profile's
# 'extraction_metadata' mapping.
st.subheader("πŸ“‹ Extraction Metadata")
# `or {}` / `or 0` cover keys that are present but null: a .get default only
# applies to *missing* keys, and formatting None with ':.2f' or ':.0%'
# raises TypeError.
metadata = profile.get('extraction_metadata') or {}
processing_time = metadata.get('processing_time') or 0
confidence = metadata.get('confidence_score') or 0
col1, col2, col3, col4 = st.columns(4)
with col1:
    st.metric("Processing Time", f"{processing_time:.2f}s")
with col2:
    st.metric("Source Files", metadata.get('source_files_count', 0))
with col3:
    # ':.0%' multiplies by 100, so the score is formatted as a fraction.
    st.metric("Confidence", f"{confidence:.0%}")
with col4:
    st.metric("LLM Calls", metadata.get('llm_calls_made', 0))
st.markdown(f"**Method:** {metadata.get('extraction_method', 'unknown')}")
st.markdown(f"**Version:** {metadata.get('version', '1.0')}")
# Raw JSON viewer (collapsed by default) for inspecting the full profile.
with st.expander("πŸ” View Raw Profile JSON"):
    st.json(profile)
# Footer: a horizontal rule followed by a centered attribution banner.
# The banner is raw HTML, so it must be rendered with unsafe_allow_html=True.
footer_html = """
<div style="text-align: center; color: #666; padding: 1rem;">
<b>Digi-Biz</b> - Agentic Business Digitization Framework<br>
Powered by Groq Vision (Llama-4-Scout) β€’ Ollama Fallback β€’ Multi-Agent Pipeline
</div>
"""
st.divider()
st.markdown(footer_html, unsafe_allow_html=True)