Spaces:

jameszokah
/

doctrace

Sleeping

App Files Files Community

doctrace / app.py

jameszokah

Initial commit

9243bca 3 months ago

raw

history blame contribute delete

15.9 kB

	import streamlit as st
	import os
	import streamlit.components.v1 as components
	from datetime import datetime

	# --- Optional third-party dependencies ---
	# Each import is attempted on its own; the matching *_AVAILABLE flag records
	# whether it succeeded so later code can branch on it.
	try:
	    import google.generativeai as genai
	except ImportError:
	    GENAI_AVAILABLE = False
	else:
	    GENAI_AVAILABLE = True

	try:
	    import langextract as lx
	except ImportError:
	    LANGEXTRACT_AVAILABLE = False
	else:
	    LANGEXTRACT_AVAILABLE = True

	try:
	    from pypdf import PdfReader
	except ImportError:
	    PYPDF_AVAILABLE = False
	else:
	    PYPDF_AVAILABLE = True

	# --- CONFIG ---
	# Page settings are gathered in one mapping and applied in a single call.
	_page_settings = {
	    "page_title": "DocuTrace AI",
	    "layout": "wide",
	    "page_icon": "🔍",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_page_settings)

	# --- LOAD CUSTOM CSS ---
	def load_css():
	    """Inject ``styles.css`` from this script's directory into the page.

	    The stylesheet is optional: when the file does not exist the function
	    returns without rendering anything.
	    """
	    css_file = os.path.join(os.path.dirname(__file__), "styles.css")
	    if not os.path.exists(css_file):
	        return
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # host's locale default encoding.
	    with open(css_file, "r", encoding="utf-8") as f:
	        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

	load_css()

	# --- SESSION STATE INIT ---
	# Seed each key only when it is absent, so values already stored in the
	# session are left untouched on later script runs.
	for _key, _make_default in (
	    ("messages", list),
	    ("extraction_count", int),
	    ("pages_processed", int),
	):
	    if _key not in st.session_state:
	        st.session_state[_key] = _make_default()

	# Load API Key
	api_key = os.environ.get("GEMINI_API_KEY")
	if api_key and GENAI_AVAILABLE:
	    # Re-export the same key under the LANGEXTRACT_API_KEY variable name.
	    os.environ["LANGEXTRACT_API_KEY"] = api_key

	# --- HELPER FUNCTIONS ---
	def add_message(role: str, content: str, msg_type: str = "text"):
	    """Append one entry to the chat history in ``st.session_state.messages``.

	    Args:
	        role: Author of the message; stored under ``"role"``.
	        content: Message body; stored under ``"content"``.
	        msg_type: Stored under ``"type"``; defaults to ``"text"``.

	    The entry also records the current local time as ``HH:MM`` under
	    ``"timestamp"``.
	    """
	    stamp = datetime.now().strftime("%H:%M")
	    entry = dict(role=role, content=content, type=msg_type, timestamp=stamp)
	    st.session_state.messages.append(entry)

	def render_chat_message(message):
	    """Build the HTML snippet for a single chat message.

	    Args:
	        message: Mapping with ``"role"`` and ``"content"`` keys and an
	            optional ``"timestamp"`` key.

	    Returns:
	        An HTML string with the avatar, the message bubble and the
	        timestamp. All interpolated values are HTML-escaped.
	    """
	    # Imported locally under a distinct name so the function is
	    # self-contained and nothing here shadows the ``html`` module.
	    from html import escape

	    raw_role = message["role"]
	    if raw_role == "user":
	        avatar = "👤"
	        bubble_class = "user"
	    else:
	        avatar = "🔍"
	        bubble_class = "assistant"

	    # The snippet is rendered with unsafe_allow_html=True and the content
	    # can contain user-typed text or exception messages, so markup
	    # characters must be neutralised before interpolation.
	    role = escape(str(raw_role), quote=True)  # used inside class attributes
	    content = escape(str(message["content"]), quote=False)
	    timestamp = escape(str(message.get("timestamp", "")), quote=False)

	    snippet = f"""
	<div class="chat-message {role}">
	<div class="chat-avatar {role}">{avatar}</div>
	<div class="chat-bubble {bubble_class}">
	<div style="margin-bottom: 4px;">{content}</div>
	<div style="font-size: 0.7rem; opacity: 0.6; text-align: right;">{timestamp}</div>
	</div>
	</div>
	"""
	    return snippet

	def render_result_card(title, content, icon="📄"):
	    """Return the HTML for one extraction result card.

	    Args:
	        title: Text placed in the card's title element.
	        content: Text placed in the card's content element.
	        icon: Text placed in the card's icon element; defaults to "📄".

	    Returns:
	        The card markup as a string, including the fixed
	        "✓ Source Verified" tag.
	    """
	    card_template = """
	<div class="result-card">
	<div class="result-card-header">
	<div class="result-card-icon">{icon}</div>
	<div class="result-card-title">{title}</div>
	</div>
	<div class="result-card-content">{content}</div>
	<span class="evidence-tag">✓ Source Verified</span>
	</div>
	"""
	    return card_template.format(icon=icon, title=title, content=content)

	# --- SIDEBAR ---
	# Renders branding, model selection, a readiness badge, the PDF uploader and
	# session statistics. Defines `model_choice`, `all_deps_ready` and
	# `uploaded_file` for the main content section.
	with st.sidebar:
	    # Logo / Brand
	    st.markdown("""
	<div style="text-align: center; padding: 1rem 0 2rem 0;">
	<div style="font-size: 3rem; margin-bottom: 0.5rem;">🔍</div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 700;
	background: linear-gradient(135deg, #f1f5f9 0%, #6366f1 100%);
	-webkit-background-clip: text; -webkit-text-fill-color: transparent;">
	DocuTrace
	</div>
	<div style="font-size: 0.8rem; color: #64748b; margin-top: 0.25rem;">
	Verifiable AI Auditor
	</div>
	</div>
	""", unsafe_allow_html=True)

	    st.markdown("### ⚙️ Configuration")

	    # Model Selection with icons
	    model_choice = st.selectbox(
	        "AI Model",
	        ["gemini-2.5-flash", "gemini-1.5-flash"],
	        help="Select the Gemini model for extraction"
	    )

	    # Status indicators
	    # bool() keeps the flag a plain True/False; without it the `and` chain
	    # would evaluate to the API key string itself when everything is set.
	    all_deps_ready = bool(
	        GENAI_AVAILABLE and LANGEXTRACT_AVAILABLE and PYPDF_AVAILABLE and api_key
	    )

	    if all_deps_ready:
	        st.markdown("""
	<div style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
	background: rgba(16, 185, 129, 0.15); border-radius: 8px; margin: 1rem 0;">
	<div style="width: 8px; height: 8px; background: #10b981; border-radius: 50%;"></div>
	<span style="color: #10b981; font-size: 0.85rem;">System Ready</span>
	</div>
	""", unsafe_allow_html=True)
	    else:
	        # Collect the name of every prerequisite that is absent.
	        missing = []
	        if not api_key:
	            missing.append("API Key")
	        if not GENAI_AVAILABLE:
	            missing.append("google-generativeai")
	        if not LANGEXTRACT_AVAILABLE:
	            missing.append("langextract")
	        if not PYPDF_AVAILABLE:
	            missing.append("pypdf")

	        st.markdown("""
	<div style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
	background: rgba(245, 158, 11, 0.15); border-radius: 8px; margin: 1rem 0;">
	<div style="width: 8px; height: 8px; background: #f59e0b; border-radius: 50%;"></div>
	<span style="color: #f59e0b; font-size: 0.85rem;">Demo Mode</span>
	</div>
	""", unsafe_allow_html=True)
	        # Surface the collected list so the user can see why the app is in
	        # demo mode (it was previously built and then discarded).
	        st.caption("Missing: " + ", ".join(missing))

	    st.divider()

	    # File Upload
	    st.markdown("### 📁 Document")
	    uploaded_file = st.file_uploader(
	        "Upload PDF",
	        type=["pdf"],
	        help="Upload your document for analysis"
	    )

	    st.divider()

	    # Stats
	    st.markdown("### 📊 Session Stats")
	    col1, col2 = st.columns(2)
	    with col1:
	        st.markdown(f"""
	<div class="metric-container">
	<div class="metric-value">{st.session_state.extraction_count}</div>
	<div class="metric-label">Extractions</div>
	</div>
	""", unsafe_allow_html=True)
	    with col2:
	        st.markdown(f"""
	<div class="metric-container">
	<div class="metric-value">{st.session_state.pages_processed}</div>
	<div class="metric-label">Pages</div>
	</div>
	""", unsafe_allow_html=True)

	    st.divider()

	    # Footer
	    st.markdown("""
	<div style="text-align: center; padding: 1rem 0; color: #64748b; font-size: 0.75rem;">
	Powered by <strong>Google LangExtract</strong><br/>
	& Gemini AI
	</div>
	""", unsafe_allow_html=True)

	# --- MAIN CONTENT ---
	# Hero banner: static markup, rendered unconditionally.
	_hero_markup = """
	<div class="hero-section">
	<h1 class="hero-title">📜 DocuTrace AI</h1>
	<p class="hero-subtitle">Extract structured data from documents with <strong>Source Grounding</strong> & verifiable evidence</p>
	<div class="hero-badge">
	<span>●</span> Production Ready
	</div>
	</div>
	"""
	st.markdown(_hero_markup, unsafe_allow_html=True)

	# Main content area
	# With an uploaded PDF: read it, collect the query, run the audit on demand
	# and show the evidence view. Without one: show the empty state.
	# NOTE(review): the indentation of this section was lost in the copy under
	# review; the nesting below is reconstructed from the if/elif/else/try
	# keywords and section comments — confirm against the deployed file.
	if uploaded_file:
	    # Read PDF
	    if PYPDF_AVAILABLE:
	        with st.spinner(""):
	            try:
	                reader = PdfReader(uploaded_file)
	                # Only the first five pages (at most) are analysed.
	                page_count = min(5, len(reader.pages))
	                # `or ""` keeps the join safe should a page yield None
	                # instead of a string.
	                text = "".join(
	                    (reader.pages[i].extract_text() or "") + "\n"
	                    for i in range(page_count)
	                )

	                st.session_state.pages_processed = page_count

	                # Success notification
	                st.markdown(f"""
	<div style="display: flex; align-items: center; gap: 12px; padding: 1rem 1.5rem;
	background: rgba(16, 185, 129, 0.1); border: 1px solid rgba(16, 185, 129, 0.3);
	border-radius: 12px; margin-bottom: 1.5rem;">
	<span style="font-size: 1.5rem;">✓</span>
	<div>
	<div style="color: #10b981; font-weight: 600;">Document Loaded</div>
	<div style="color: #94a3b8; font-size: 0.85rem;">
	{len(reader.pages)} pages total • Analyzing first {page_count} pages
	</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	            except Exception as e:
	                st.error(f"Error reading PDF: {e}")
	                st.stop()
	    else:
	        st.warning("PyPDF module not installed. PDF parsing disabled.")
	        text = ""

	    # Query Section
	    st.markdown("### 💬 What would you like to extract?")

	    col1, col2 = st.columns([2, 1])
	    with col1:
	        topic = st.text_input(
	            "Search Topic",
	            placeholder="e.g., Risk Factors, Financial Data, Legal Terms...",
	            label_visibility="collapsed"
	        )
	    with col2:
	        fields = st.text_input(
	            "Fields",
	            placeholder="category, summary, impact",
	            label_visibility="collapsed"
	        )

	    # Chat History Display
	    if st.session_state.messages:
	        st.markdown("### 📝 Conversation")
	        chat_html = '<div class="chat-container">'
	        for msg in st.session_state.messages:
	            chat_html += render_chat_message(msg)
	        chat_html += '</div>'
	        st.markdown(chat_html, unsafe_allow_html=True)

	    # Action Button
	    col1, col2, col3 = st.columns([1, 1, 1])
	    with col2:
	        run_audit = st.button("🚀 Run Audit", type="primary", use_container_width=True)

	    if run_audit:
	        if not all_deps_ready:
	            # Demo mode - show simulated results
	            add_message("user", f"Extract {topic or 'Key Information'} with fields: {fields or 'auto-detect'}")
	            add_message("assistant", "⚠️ Running in Demo Mode. Install required dependencies (google-generativeai, langextract, pypdf) and set GEMINI_API_KEY for full functionality.")
	            st.session_state.extraction_count += 1
	            # The rerun below halts this run and its output is replaced by
	            # the next run, so the demo card is requested through a session
	            # flag and drawn after the rerun instead of right here.
	            st.session_state["demo_result_visible"] = True
	            st.rerun()
	        elif not topic:
	            st.warning("Please enter a search topic")
	        elif not text.strip():
	            # Nothing was extracted from the analysed pages, so there is
	            # nothing to send to the model.
	            st.warning("No extractable text was found in the analyzed pages.")
	        else:
	            # Full extraction mode
	            add_message("user", f"Extract {topic} with fields: {fields or 'auto-detect'}")

	            with st.status("🕵️ Analyzing Document...", expanded=True) as status:
	                try:
	                    prompt = f"Extract '{topic}'. Fields: {fields}."
	                    status.write("🔍 Scanning document content...")

	                    # Single worked example passed to the extractor.
	                    examples = [
	                        lx.data.ExampleData(
	                            text="The company faces regulatory risks.",
	                            extractions=[lx.data.Extraction(
	                                extraction_class="item",
	                                extraction_text="The company faces regulatory risks",
	                                attributes={"category": "Legal"}
	                            )]
	                        )
	                    ]

	                    status.write("🤖 AI is extracting data...")

	                    result = lx.extract(
	                        text_or_documents=text,
	                        prompt_description=prompt,
	                        examples=examples,
	                        model_id=model_choice
	                    )

	                    # Treat a missing extraction list as "no items".
	                    extraction_count = len(result.extractions or [])
	                    st.session_state.extraction_count += extraction_count

	                    status.write(f"✅ Found {extraction_count} items")
	                    status.write("🎨 Generating evidence visualization...")
	                    # Only persist the annotated document here; the HTML view
	                    # is built from this file in the results section below.
	                    lx.io.save_annotated_documents([result], output_name="data.jsonl", output_dir=".")

	                    status.update(label="✅ Audit Complete!", state="complete")
	                    add_message("assistant", f"Found {extraction_count} relevant items for '{topic}'. Evidence is highlighted below with source verification.")
	                    # Real results supersede any earlier demo card.
	                    st.session_state["demo_result_visible"] = False

	                except Exception as e:
	                    status.update(label="❌ Error", state="error")
	                    add_message("assistant", f"⚠️ Extraction failed: {str(e)}")
	                    st.error(f"Extraction Failed: {e}")
	                else:
	                    # Rerun only after a fully successful extraction, and
	                    # outside the try body so it never meets the handler.
	                    st.rerun()

	    # Demo result card, requested by a demo-mode audit in an earlier run
	    if st.session_state.get("demo_result_visible"):
	        st.markdown("### 🔍 Demo Results")
	        st.markdown(render_result_card(
	            "Sample Extraction",
	            "This is a demonstration of the DocuTrace extraction interface. In production mode with proper dependencies installed, real AI-powered extractions with source grounding would appear here.",
	            "📊"
	        ), unsafe_allow_html=True)

	    # Results Display (after extraction)
	    # NOTE(review): data.jsonl is written to the working directory and is
	    # not tied to the current session or upload, so this section can show
	    # evidence from an earlier document — consider per-session storage.
	    if os.path.exists("data.jsonl") and LANGEXTRACT_AVAILABLE:
	        st.markdown("---")
	        st.markdown("""
	<div style="display: flex; align-items: center; gap: 12px; margin-bottom: 1rem;">
	<div style="font-size: 1.5rem;">🔍</div>
	<div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.25rem; font-weight: 600; color: #f1f5f9;">
	Verified Evidence
	</div>
	<div style="color: #64748b; font-size: 0.85rem;">
	Click highlights to see source text
	</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	        try:
	            html_obj = lx.visualize("data.jsonl")
	            # Accept both a plain HTML string and an object exposing the
	            # markup through a `.data` attribute.
	            html_content = getattr(html_obj, "data", html_obj)
	            components.html(html_content, height=600, scrolling=True)
	        except Exception as viz_error:
	            # Best effort: the page keeps working, but the failure is
	            # reported instead of being silently swallowed.
	            st.warning(f"Could not render the evidence view: {viz_error}")

	else:
	    # Empty State
	    st.markdown("""
	<div style="text-align: center; padding: 4rem 2rem;">
	<div style="font-size: 4rem; margin-bottom: 1.5rem; opacity: 0.5;">📄</div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 600; color: #94a3b8; margin-bottom: 0.5rem;">
	No Document Loaded
	</div>
	<div style="color: #64748b; max-width: 400px; margin: 0 auto;">
	Upload a PDF from the sidebar to begin extracting structured data with verifiable source grounding.
	</div>
	</div>
	""", unsafe_allow_html=True)

	    # Feature Cards
	    # NOTE(review): assumed to belong to the empty state — confirm.
	    st.markdown("<br>", unsafe_allow_html=True)

	    col1, col2, col3 = st.columns(3)

	    with col1:
	        st.markdown("""
	<div class="glass-card" style="text-align: center;">
	<div style="font-size: 2.5rem; margin-bottom: 1rem;">🎯</div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.1rem; font-weight: 600; color: #f1f5f9; margin-bottom: 0.5rem;">
	Precision Extraction
	</div>
	<div style="color: #64748b; font-size: 0.9rem;">
	Extract exactly what you need with AI-powered document understanding
	</div>
	</div>
	""", unsafe_allow_html=True)

	    with col2:
	        st.markdown("""
	<div class="glass-card" style="text-align: center;">
	<div style="font-size: 2.5rem; margin-bottom: 1rem;">✓</div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.1rem; font-weight: 600; color: #f1f5f9; margin-bottom: 0.5rem;">
	Source Grounding
	</div>
	<div style="color: #64748b; font-size: 0.9rem;">
	Every extraction is linked to its exact source location in the document
	</div>
	</div>
	""", unsafe_allow_html=True)

	    with col3:
	        st.markdown("""
	<div class="glass-card" style="text-align: center;">
	<div style="font-size: 2.5rem; margin-bottom: 1rem;">🔒</div>
	<div style="font-family: 'Outfit', sans-serif; font-size: 1.1rem; font-weight: 600; color: #f1f5f9; margin-bottom: 0.5rem;">
	Enterprise Ready
	</div>
	<div style="color: #64748b; font-size: 0.9rem;">
	Built for high-stakes domains: Legal, Finance, and Compliance
	</div>
	</div>
	""", unsafe_allow_html=True)