# doctrace / app.py
# jameszokah's picture
# Initial commit
# 9243bca
import streamlit as st
import os
import streamlit.components.v1 as components
from datetime import datetime
# --- Try to import optional dependencies ---
try:
import google.generativeai as genai
GENAI_AVAILABLE = True
except ImportError:
GENAI_AVAILABLE = False
try:
import langextract as lx
LANGEXTRACT_AVAILABLE = True
except ImportError:
LANGEXTRACT_AVAILABLE = False
try:
from pypdf import PdfReader
PYPDF_AVAILABLE = True
except ImportError:
PYPDF_AVAILABLE = False
# --- CONFIG ---
# Page-level Streamlit settings: browser tab title and icon, wide layout,
# and the sidebar opened on first load.
_page_settings = {
    "page_title": "DocuTrace AI",
    "page_icon": "πŸ”",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
# --- LOAD CUSTOM CSS ---
def load_css():
    """Inject the app's custom stylesheet into the page, if one is present.

    Looks for ``styles.css`` in the same directory as this script and, when
    it exists, embeds its contents in a ``<style>`` tag via ``st.markdown``.
    A missing file is skipped without any message.
    """
    css_file = os.path.join(os.path.dirname(__file__), "styles.css")
    if not os.path.exists(css_file):
        return
    # Read as UTF-8 explicitly: relying on the platform default encoding can
    # fail or garble a stylesheet that contains non-ASCII characters.
    with open(css_file, "r", encoding="utf-8") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


load_css()
# --- SESSION STATE INIT ---
# Seed the per-session defaults on the first run only; values already stored
# by earlier reruns are left untouched.
for _state_key, _make_default in (
    ("messages", list),          # chat transcript, starts empty
    ("extraction_count", int),   # running total, starts at 0
    ("pages_processed", int),    # pages read from the last PDF, starts at 0
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
# Load API Key
api_key = os.environ.get("GEMINI_API_KEY")
# When a key is configured and google-generativeai imported successfully,
# mirror the key under LANGEXTRACT_API_KEY (presumably the variable the
# langextract library reads -- confirm against its documentation).
if GENAI_AVAILABLE and api_key:
    os.environ["LANGEXTRACT_API_KEY"] = api_key
# --- HELPER FUNCTIONS ---
def add_message(role: str, content: str, msg_type: str = "text"):
    """Append one entry to the chat transcript kept in session state.

    Args:
        role: Author of the message; stored under the "role" key.
        content: Message body; stored under the "content" key.
        msg_type: Stored under the "type" key; defaults to "text".

    Each entry is also stamped with the current local time as HH:MM under
    the "timestamp" key.
    """
    stamp = f"{datetime.now():%H:%M}"
    entry = dict(role=role, content=content, type=msg_type, timestamp=stamp)
    st.session_state.messages.append(entry)
def render_chat_message(message):
    """Build the HTML snippet for a single chat message.

    Args:
        message: Mapping with required "role" and "content" keys and an
            optional "timestamp" key (a missing timestamp renders as empty).

    Returns:
        An HTML string containing the avatar, the message bubble and the
        timestamp. The role is emitted as a CSS class; the role "user" gets
        the user avatar and bubble, every other role is styled as the
        assistant.

    Raises:
        KeyError: If "role" or "content" is missing from ``message``.

    The role, content and timestamp are HTML-escaped before interpolation.
    The result is meant to be rendered with ``unsafe_allow_html=True`` and
    the content can carry user-typed text or exception messages, so raw
    interpolation would let that text inject markup into the page.
    """
    # Local import so the function stays self-contained; the result variable
    # is deliberately not named ``html`` to avoid shadowing the module.
    from html import escape

    role = message["role"]
    is_user = role == "user"
    avatar = "πŸ‘€" if is_user else "πŸ”"
    bubble_class = "user" if is_user else "assistant"

    safe_role = escape(str(role))
    safe_content = escape(str(message["content"]))
    safe_timestamp = escape(str(message.get("timestamp", "")))

    # Leading/trailing empty entries reproduce the newline that surrounds
    # the markup.
    parts = [
        "",
        f'<div class="chat-message {safe_role}">',
        f'<div class="chat-avatar {safe_role}">{avatar}</div>',
        f'<div class="chat-bubble {bubble_class}">',
        f'<div style="margin-bottom: 4px;">{safe_content}</div>',
        '<div style="font-size: 0.7rem; opacity: 0.6; text-align: right;">'
        f"{safe_timestamp}</div>",
        "</div>",
        "</div>",
        "",
    ]
    return "\n".join(parts)
def render_result_card(title, content, icon="πŸ“„"):
    """Build the HTML snippet for one extraction result card.

    Args:
        title: Text placed in the card header.
        content: Text placed in the card body.
        icon: Glyph shown next to the title.

    Returns:
        An HTML string with the header (icon and title), the body, and a
        fixed "Source Verified" tag. The arguments are interpolated as-is,
        without escaping.
    """
    # The empty first and last entries reproduce the newline that surrounds
    # the markup.
    rows = (
        "",
        '<div class="result-card">',
        '<div class="result-card-header">',
        f'<div class="result-card-icon">{icon}</div>',
        f'<div class="result-card-title">{title}</div>',
        "</div>",
        f'<div class="result-card-content">{content}</div>',
        '<span class="evidence-tag">βœ“ Source Verified</span>',
        "</div>",
        "",
    )
    return "\n".join(rows)
# --- SIDEBAR ---
# Sidebar: branding, model choice, readiness indicator, PDF upload and
# session statistics. Defines `model_choice`, `all_deps_ready`, `missing`
# and `uploaded_file` for use by the main content further down.
with st.sidebar:
    # Logo / brand
    st.markdown("""
<div style="text-align: center; padding: 1rem 0 2rem 0;">
<div style="font-size: 3rem; margin-bottom: 0.5rem;">πŸ”</div>
<div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 700;
background: linear-gradient(135deg, #f1f5f9 0%, #6366f1 100%);
-webkit-background-clip: text; -webkit-text-fill-color: transparent;">
DocuTrace
</div>
<div style="font-size: 0.8rem; color: #64748b; margin-top: 0.25rem;">
Verifiable AI Auditor
</div>
</div>
""", unsafe_allow_html=True)

    st.markdown("### βš™οΈ Configuration")

    # Model selection
    model_choice = st.selectbox(
        "AI Model",
        ["gemini-2.5-flash", "gemini-1.5-flash"],
        help="Select the Gemini model for extraction"
    )

    # Readiness check. Each requirement is paired with its display label so
    # the same table drives both the flag and the list of missing items.
    requirements = (
        ("API Key", api_key),
        ("google-generativeai", GENAI_AVAILABLE),
        ("langextract", LANGEXTRACT_AVAILABLE),
        ("pypdf", PYPDF_AVAILABLE),
    )
    missing = [label for label, satisfied in requirements if not satisfied]
    # A real boolean: the previous `... and api_key` chain left the API key
    # string itself in this variable whenever everything was available.
    all_deps_ready = not missing

    if all_deps_ready:
        st.markdown("""
<div style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
background: rgba(16, 185, 129, 0.15); border-radius: 8px; margin: 1rem 0;">
<div style="width: 8px; height: 8px; background: #10b981; border-radius: 50%;"></div>
<span style="color: #10b981; font-size: 0.85rem;">System Ready</span>
</div>
""", unsafe_allow_html=True)
    else:
        st.markdown("""
<div style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
background: rgba(245, 158, 11, 0.15); border-radius: 8px; margin: 1rem 0;">
<div style="width: 8px; height: 8px; background: #f59e0b; border-radius: 50%;"></div>
<span style="color: #f59e0b; font-size: 0.85rem;">Demo Mode</span>
</div>
""", unsafe_allow_html=True)
        # The missing list used to be computed and then dropped; show it so
        # the user knows what to install or configure. Labels are fixed
        # strings, so no escaping is needed.
        st.caption(f"Missing: {', '.join(missing)}")

    st.divider()

    # File upload
    st.markdown("### πŸ“ Document")
    uploaded_file = st.file_uploader(
        "Upload PDF",
        type=["pdf"],
        help="Upload your document for analysis"
    )

    st.divider()

    # Stats: two side-by-side metric tiles sharing one template.
    st.markdown("### πŸ“Š Session Stats")
    col1, col2 = st.columns(2)
    metric_tiles = (
        (col1, st.session_state.extraction_count, "Extractions"),
        (col2, st.session_state.pages_processed, "Pages"),
    )
    for column, value, label in metric_tiles:
        with column:
            st.markdown(f"""
<div class="metric-container">
<div class="metric-value">{value}</div>
<div class="metric-label">{label}</div>
</div>
""", unsafe_allow_html=True)

    st.divider()

    # Footer
    st.markdown("""
<div style="text-align: center; padding: 1rem 0; color: #64748b; font-size: 0.75rem;">
Powered by <strong>Google LangExtract</strong><br/>
& Gemini AI
</div>
""", unsafe_allow_html=True)
# --- MAIN CONTENT ---
# Hero Section
# Landing banner shown at the top of the main area: title, tagline and a
# status badge.
_hero_markup = """
<div class="hero-section">
<h1 class="hero-title">πŸ“œ DocuTrace AI</h1>
<p class="hero-subtitle">Extract structured data from documents with <strong>Source Grounding</strong> & verifiable evidence</p>
<div class="hero-badge">
<span>●</span> Production Ready
</div>
</div>
"""
st.markdown(_hero_markup, unsafe_allow_html=True)
# Main content area
# Main content. With a document uploaded: read the PDF, collect the query,
# show the chat transcript, run the audit on demand and display the saved
# evidence visualization. Without a document: show the empty state and the
# feature cards.
if uploaded_file:
    # --- Read PDF ---
    if PYPDF_AVAILABLE:
        with st.spinner(""):
            try:
                reader = PdfReader(uploaded_file)
                total_pages = len(reader.pages)
                # Only the first five pages are analyzed.
                page_count = min(5, total_pages)
                # Defensive `or ""`: a page that yields a falsy result is
                # treated as empty text instead of breaking concatenation.
                text = "".join(
                    (reader.pages[i].extract_text() or "") + "\n"
                    for i in range(page_count)
                )
                st.session_state.pages_processed = page_count

                # Success notification
                st.markdown(f"""
<div style="display: flex; align-items: center; gap: 12px; padding: 1rem 1.5rem;
background: rgba(16, 185, 129, 0.1); border: 1px solid rgba(16, 185, 129, 0.3);
border-radius: 12px; margin-bottom: 1.5rem;">
<span style="font-size: 1.5rem;">βœ“</span>
<div>
<div style="color: #10b981; font-weight: 600;">Document Loaded</div>
<div style="color: #94a3b8; font-size: 0.85rem;">
{total_pages} pages total β€’ Analyzing first {page_count} pages
</div>
</div>
</div>
""", unsafe_allow_html=True)
            except Exception as e:
                # Any parsing problem ends this run with a visible error.
                st.error(f"Error reading PDF: {e}")
                st.stop()
    else:
        st.warning("PyPDF module not installed. PDF parsing disabled.")
        text = ""

    # --- Query section ---
    st.markdown("### πŸ’¬ What would you like to extract?")
    topic_col, fields_col = st.columns([2, 1])
    with topic_col:
        topic = st.text_input(
            "Search Topic",
            placeholder="e.g., Risk Factors, Financial Data, Legal Terms...",
            label_visibility="collapsed"
        )
    with fields_col:
        fields = st.text_input(
            "Fields",
            placeholder="category, summary, impact",
            label_visibility="collapsed"
        )

    # --- Chat history ---
    if st.session_state.messages:
        st.markdown("### πŸ“ Conversation")
        chat_html = (
            '<div class="chat-container">'
            + "".join(render_chat_message(msg) for msg in st.session_state.messages)
            + '</div>'
        )
        st.markdown(chat_html, unsafe_allow_html=True)

    # --- Action button (centered in the middle of three equal columns) ---
    _, center_col, _ = st.columns([1, 1, 1])
    with center_col:
        run_audit = st.button("πŸš€ Run Audit", type="primary", use_container_width=True)

    if run_audit:
        if not all_deps_ready:
            # Demo mode: record the exchange and request the sample card.
            add_message("user", f"Extract **{topic or 'Key Information'}** with fields: {fields or 'auto-detect'}")
            add_message("assistant", "⚠️ Running in Demo Mode. Install required dependencies (google-generativeai, langextract, pypdf) and set GEMINI_API_KEY for full functionality.")
            st.session_state.extraction_count += 1
            # st.rerun() ends this run immediately, so a card drawn here
            # would be discarded. Remember the request in session state and
            # draw the card on the next run instead.
            st.session_state["show_demo_result"] = True
            st.rerun()
        elif not topic:
            st.warning("Please enter a search topic")
        else:
            # Full extraction mode
            add_message("user", f"Extract **{topic}** with fields: {fields or 'auto-detect'}")
            st.session_state["show_demo_result"] = False
            with st.status("πŸ•΅οΈ Analyzing Document...", expanded=True) as status:
                try:
                    prompt = f"Extract '{topic}'. Fields: {fields}."
                    status.write("πŸ” Scanning document content...")
                    # Single worked example passed to the extractor.
                    examples = [
                        lx.data.ExampleData(
                            text="The company faces regulatory risks.",
                            extractions=[lx.data.Extraction(
                                extraction_class="item",
                                extraction_text="The company faces regulatory risks",
                                attributes={"category": "Legal"}
                            )]
                        )
                    ]
                    status.write("πŸ€– AI is extracting data...")
                    result = lx.extract(
                        text_or_documents=text,
                        prompt_description=prompt,
                        examples=examples,
                        model_id=model_choice
                    )
                    # Defensive: count a missing extraction list as zero.
                    extraction_count = len(result.extractions or [])
                    st.session_state.extraction_count += extraction_count
                    status.write(f"βœ… Found {extraction_count} items")
                    status.write("🎨 Generating evidence visualization...")
                    # NOTE(review): data.jsonl lives in the working directory
                    # and looks shared by every session of this app, so one
                    # user's results can appear for another -- confirm and
                    # consider a per-session path.
                    lx.io.save_annotated_documents([result], output_name="data.jsonl", output_dir=".")
                except Exception as e:
                    status.update(label="❌ Error", state="error")
                    add_message("assistant", f"⚠️ Extraction failed: {str(e)}")
                    st.error(f"Extraction Failed: {e}")
                else:
                    # Success handling sits outside the try body so the
                    # rerun can never be mistaken for an extraction error.
                    # The visualization itself is rendered from the saved
                    # file in the results section on the next run.
                    status.update(label="βœ… Audit Complete!", state="complete")
                    add_message("assistant", f"Found **{extraction_count}** relevant items for '{topic}'. Evidence is highlighted below with source verification.")
                    st.rerun()

    # --- Demo results (requested by a demo-mode audit on an earlier run) ---
    if st.session_state.get("show_demo_result"):
        st.markdown("### πŸ” Demo Results")
        st.markdown(render_result_card(
            "Sample Extraction",
            "This is a demonstration of the DocuTrace extraction interface. In production mode with proper dependencies installed, real AI-powered extractions with source grounding would appear here.",
            "πŸ“Š"
        ), unsafe_allow_html=True)

    # --- Results display (after extraction) ---
    if os.path.exists("data.jsonl") and LANGEXTRACT_AVAILABLE:
        st.markdown("---")
        st.markdown("""
<div style="display: flex; align-items: center; gap: 12px; margin-bottom: 1rem;">
<div style="font-size: 1.5rem;">πŸ”</div>
<div>
<div style="font-family: 'Outfit', sans-serif; font-size: 1.25rem; font-weight: 600; color: #f1f5f9;">
Verified Evidence
</div>
<div style="color: #64748b; font-size: 0.85rem;">
Click highlights to see source text
</div>
</div>
</div>
""", unsafe_allow_html=True)
        try:
            visualization = lx.visualize("data.jsonl")
            # visualize() may hand back an object exposing `.data` (notebook
            # environments) or a plain HTML string; accept both.
            html_content = getattr(visualization, "data", visualization)
            components.html(html_content, height=600, scrolling=True)
        except Exception as e:
            # Keep the page usable, but report the failure instead of
            # hiding it.
            st.warning(f"Could not render the evidence visualization: {e}")
else:
    # --- Empty state ---
    st.markdown("""
<div style="text-align: center; padding: 4rem 2rem;">
<div style="font-size: 4rem; margin-bottom: 1.5rem; opacity: 0.5;">πŸ“„</div>
<div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 600; color: #94a3b8; margin-bottom: 0.5rem;">
No Document Loaded
</div>
<div style="color: #64748b; max-width: 400px; margin: 0 auto;">
Upload a PDF from the sidebar to begin extracting structured data with verifiable source grounding.
</div>
</div>
""", unsafe_allow_html=True)

    # --- Feature cards: three columns rendered from one shared template ---
    st.markdown("<br>", unsafe_allow_html=True)
    feature_cards = (
        ("🎯", "Precision Extraction",
         "Extract exactly what you need with AI-powered document understanding"),
        ("βœ“", "Source Grounding",
         "Every extraction is linked to its exact source location in the document"),
        ("πŸ”’", "Enterprise Ready",
         "Built for high-stakes domains: Legal, Finance, and Compliance"),
    )
    for column, (card_icon, card_title, card_text) in zip(st.columns(3), feature_cards):
        with column:
            st.markdown(f"""
<div class="glass-card" style="text-align: center;">
<div style="font-size: 2.5rem; margin-bottom: 1rem;">{card_icon}</div>
<div style="font-family: 'Outfit', sans-serif; font-size: 1.1rem; font-weight: 600; color: #f1f5f9; margin-bottom: 0.5rem;">
{card_title}
</div>
<div style="color: #64748b; font-size: 0.9rem;">
{card_text}
</div>
</div>
""", unsafe_allow_html=True)