Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import streamlit.components.v1 as components | |
| from datetime import datetime | |
| # --- Try to import optional dependencies --- | |
| try: | |
| import google.generativeai as genai | |
| GENAI_AVAILABLE = True | |
| except ImportError: | |
| GENAI_AVAILABLE = False | |
| try: | |
| import langextract as lx | |
| LANGEXTRACT_AVAILABLE = True | |
| except ImportError: | |
| LANGEXTRACT_AVAILABLE = False | |
| try: | |
| from pypdf import PdfReader | |
| PYPDF_AVAILABLE = True | |
| except ImportError: | |
| PYPDF_AVAILABLE = False | |
# --- CONFIG ---
# Page-level settings for the app, collected in one mapping and applied in a
# single call.
_PAGE_SETTINGS = {
    "page_title": "DocuTrace AI",
    "layout": "wide",
    "page_icon": "π",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
| # --- LOAD CUSTOM CSS --- | |
def load_css():
    """Inject the optional ``styles.css`` located next to this script.

    The stylesheet is wrapped in a ``<style>`` tag and emitted through
    ``st.markdown``. When the file does not exist the function does nothing.
    """
    css_file = os.path.join(os.path.dirname(__file__), "styles.css")
    if not os.path.exists(css_file):
        return

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(css_file, "r", encoding="utf-8") as f:
        css = f.read()

    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)


load_css()
# --- SESSION STATE INIT ---
# Seed each session key once; values already present in the session are kept.
for _state_key, _initial_value in (
    ("messages", []),
    ("extraction_count", 0),
    ("pages_processed", 0),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Load API Key
# The Gemini key is read from the environment and, when the Gemini client
# library imported successfully, mirrored under LANGEXTRACT_API_KEY.
api_key = os.environ.get("GEMINI_API_KEY")
if GENAI_AVAILABLE and api_key:
    os.environ["LANGEXTRACT_API_KEY"] = api_key
| # --- HELPER FUNCTIONS --- | |
def add_message(role: str, content: str, msg_type: str = "text"):
    """Append one entry to ``st.session_state.messages``.

    Args:
        role: Author of the message (the renderer distinguishes ``"user"``
            from everything else).
        content: Message text.
        msg_type: Stored under the ``"type"`` key; defaults to ``"text"``.

    The entry also records the current local time as ``HH:MM`` under
    ``"timestamp"``.
    """
    stamp = datetime.now().strftime("%H:%M")
    entry = dict(role=role, content=content, type=msg_type, timestamp=stamp)
    st.session_state.messages.append(entry)
def render_chat_message(message):
    """Build the HTML snippet for a single chat message.

    Args:
        message: Mapping with the keys ``"role"`` and ``"content"`` and an
            optional ``"timestamp"`` (defaults to an empty string).

    Returns:
        An HTML string. A ``"user"`` role gets the user avatar and bubble
        class; any other role is rendered as the assistant.

    Raises:
        KeyError: If ``"role"`` or ``"content"`` is missing.

    The role, content and timestamp are HTML-escaped: message content is
    built from free-text user input and exception text, and the result is
    rendered with ``unsafe_allow_html=True``, so raw markup must not pass
    through.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import section.
    from html import escape

    role = message["role"]
    is_user = role == "user"
    avatar = "π€" if is_user else "π"
    bubble_class = "user" if is_user else "assistant"

    safe_role = escape(str(role))
    safe_content = escape(str(message["content"]))
    safe_timestamp = escape(str(message.get("timestamp", "")))

    # Lines are deliberately unindented so that snippets concatenated by the
    # caller are never mistaken for an indented Markdown code block.
    return (
        "\n"
        f'<div class="chat-message {safe_role}">\n'
        f'<div class="chat-avatar {safe_role}">{avatar}</div>\n'
        f'<div class="chat-bubble {bubble_class}">\n'
        f'<div style="margin-bottom: 4px;">{safe_content}</div>\n'
        '<div style="font-size: 0.7rem; opacity: 0.6; text-align: right;">'
        f"{safe_timestamp}</div>\n"
        "</div>\n"
        "</div>\n"
    )
def render_result_card(title, content, icon="π"):
    """Return the HTML for one extraction result card.

    Args:
        title: Text placed in the card's title element.
        content: Text or markup placed in the card's content element.
        icon: Glyph placed in the card's icon element.

    The values are inserted as given (no escaping), followed by a fixed
    "Source Verified" tag.
    """
    header = (
        '<div class="result-card-header">\n'
        f'<div class="result-card-icon">{icon}</div>\n'
        f'<div class="result-card-title">{title}</div>\n'
        "</div>\n"
    )
    body = f'<div class="result-card-content">{content}</div>\n'
    badge = '<span class="evidence-tag">β Source Verified</span>\n'
    return f'\n<div class="result-card">\n{header}{body}{badge}</div>\n'
# --- SIDEBAR ---
# Builds the sidebar and defines three names the main area reads afterwards:
# `model_choice`, `all_deps_ready` and `uploaded_file`.
with st.sidebar:
    # Logo / Brand
    st.markdown(
        """
        <div style="text-align: center; padding: 1rem 0 2rem 0;">
            <div style="font-size: 3rem; margin-bottom: 0.5rem;">π</div>
            <div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 700;
                background: linear-gradient(135deg, #f1f5f9 0%, #6366f1 100%);
                -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
                DocuTrace
            </div>
            <div style="font-size: 0.8rem; color: #64748b; margin-top: 0.25rem;">
                Verifiable AI Auditor
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    st.markdown("### βοΈ Configuration")

    # Model Selection with icons
    model_choice = st.selectbox(
        "AI Model",
        ["gemini-2.5-flash", "gemini-1.5-flash"],
        help="Select the Gemini model for extraction",
    )

    # Status indicators
    # Normalised to a real bool; the raw `and` chain would otherwise evaluate
    # to the API key string (or None) instead of True/False.
    all_deps_ready = bool(
        GENAI_AVAILABLE and LANGEXTRACT_AVAILABLE and PYPDF_AVAILABLE and api_key
    )

    if all_deps_ready:
        st.markdown(
            """
            <div style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
                background: rgba(16, 185, 129, 0.15); border-radius: 8px; margin: 1rem 0;">
                <div style="width: 8px; height: 8px; background: #10b981; border-radius: 50%;"></div>
                <span style="color: #10b981; font-size: 0.85rem;">System Ready</span>
            </div>
            """,
            unsafe_allow_html=True,
        )
    else:
        # Each requirement paired with whether it is currently satisfied.
        requirements = (
            ("API Key", bool(api_key)),
            ("google-generativeai", GENAI_AVAILABLE),
            ("langextract", LANGEXTRACT_AVAILABLE),
            ("pypdf", PYPDF_AVAILABLE),
        )
        missing = [label for label, satisfied in requirements if not satisfied]
        # Previously this list was computed and then discarded; expose it as a
        # tooltip so the reason for Demo Mode is discoverable. The labels are
        # fixed strings from this block, so no escaping is required.
        missing_summary = ", ".join(missing)
        st.markdown(
            f"""
            <div title="Missing: {missing_summary}"
                style="display: flex; align-items: center; gap: 8px; padding: 0.75rem;
                background: rgba(245, 158, 11, 0.15); border-radius: 8px; margin: 1rem 0;">
                <div style="width: 8px; height: 8px; background: #f59e0b; border-radius: 50%;"></div>
                <span style="color: #f59e0b; font-size: 0.85rem;">Demo Mode</span>
            </div>
            """,
            unsafe_allow_html=True,
        )

    st.divider()

    # File Upload
    st.markdown("### π Document")
    uploaded_file = st.file_uploader(
        "Upload PDF",
        type=["pdf"],
        help="Upload your document for analysis",
    )

    st.divider()

    # Stats
    st.markdown("### π Session Stats")
    session_stats = (
        ("Extractions", st.session_state.extraction_count),
        ("Pages", st.session_state.pages_processed),
    )
    # One metric tile per column; both tiles share the same markup.
    for stat_column, (stat_label, stat_value) in zip(st.columns(2), session_stats):
        with stat_column:
            st.markdown(
                f"""
                <div class="metric-container">
                    <div class="metric-value">{stat_value}</div>
                    <div class="metric-label">{stat_label}</div>
                </div>
                """,
                unsafe_allow_html=True,
            )

    st.divider()

    # Footer
    st.markdown(
        """
        <div style="text-align: center; padding: 1rem 0; color: #64748b; font-size: 0.75rem;">
            Powered by <strong>Google LangExtract</strong><br/>
            & Gemini AI
        </div>
        """,
        unsafe_allow_html=True,
    )
# --- MAIN CONTENT ---
# Hero Section
# Static banner markup, kept in a named constant and rendered once at the top
# of the main area.
_HERO_HTML = """
<div class="hero-section">
    <h1 class="hero-title">π DocuTrace AI</h1>
    <p class="hero-subtitle">Extract structured data from documents with <strong>Source Grounding</strong> & verifiable evidence</p>
    <div class="hero-badge">
        <span>β</span> Production Ready
    </div>
</div>
"""
st.markdown(_HERO_HTML, unsafe_allow_html=True)
# Main content area
# With an uploaded PDF: read its text, collect the query, run the audit on
# demand and show the evidence viewer. Without one: show the empty state and
# the feature cards.
if uploaded_file:
    # Read PDF
    if PYPDF_AVAILABLE:
        with st.spinner(""):
            try:
                reader = PdfReader(uploaded_file)
                # Only the first five pages are analysed.
                page_count = min(5, len(reader.pages))
                # Each page's text is followed by a newline. `or ""` is
                # defensive in case a page yields no text object.
                text = "".join(
                    (reader.pages[i].extract_text() or "") + "\n"
                    for i in range(page_count)
                )
                st.session_state.pages_processed = page_count

                # Success notification
                st.markdown(
                    f"""
                    <div style="display: flex; align-items: center; gap: 12px; padding: 1rem 1.5rem;
                        background: rgba(16, 185, 129, 0.1); border: 1px solid rgba(16, 185, 129, 0.3);
                        border-radius: 12px; margin-bottom: 1.5rem;">
                        <span style="font-size: 1.5rem;">β</span>
                        <div>
                            <div style="color: #10b981; font-weight: 600;">Document Loaded</div>
                            <div style="color: #94a3b8; font-size: 0.85rem;">
                                {len(reader.pages)} pages total β’ Analyzing first {page_count} pages
                            </div>
                        </div>
                    </div>
                    """,
                    unsafe_allow_html=True,
                )
            except Exception as e:
                # Nothing below can work without the document text, so halt
                # this script run after reporting the problem.
                st.error(f"Error reading PDF: {e}")
                st.stop()
    else:
        st.warning("PyPDF module not installed. PDF parsing disabled.")
        text = ""

    # Query Section
    st.markdown("### π¬ What would you like to extract?")
    col1, col2 = st.columns([2, 1])
    with col1:
        topic = st.text_input(
            "Search Topic",
            placeholder="e.g., Risk Factors, Financial Data, Legal Terms...",
            label_visibility="collapsed",
        )
    with col2:
        fields = st.text_input(
            "Fields",
            placeholder="category, summary, impact",
            label_visibility="collapsed",
        )

    # Chat History Display
    if st.session_state.messages:
        st.markdown("### π Conversation")
        rendered_messages = "".join(
            render_chat_message(msg) for msg in st.session_state.messages
        )
        chat_html = f'<div class="chat-container">{rendered_messages}</div>'
        st.markdown(chat_html, unsafe_allow_html=True)

    # Action Button
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        run_audit = st.button("π Run Audit", type="primary", use_container_width=True)

    if run_audit:
        if not all_deps_ready:
            # Demo mode - show simulated results
            add_message("user", f"Extract **{topic or 'Key Information'}** with fields: {fields or 'auto-detect'}")
            add_message("assistant", "β οΈ Running in Demo Mode. Install required dependencies (google-generativeai, langextract, pypdf) and set GEMINI_API_KEY for full functionality.")
            st.session_state.extraction_count += 1
            # NOTE(review): the demo card used to be drawn right here, just
            # before st.rerun(); the rerun restarts the script, so the card
            # never stayed on screen. Record a flag instead and draw the card
            # on the following run (see below).
            st.session_state["demo_result_visible"] = True
            st.rerun()
        elif not topic:
            st.warning("Please enter a search topic")
        else:
            # Full extraction mode
            add_message("user", f"Extract **{topic}** with fields: {fields or 'auto-detect'}")
            with st.status("π΅οΈ Analyzing Document...", expanded=True) as status:
                try:
                    prompt = f"Extract '{topic}'. Fields: {fields}."
                    status.write("π Scanning document content...")

                    # Single few-shot example passed to the extractor.
                    examples = [
                        lx.data.ExampleData(
                            text="The company faces regulatory risks.",
                            extractions=[
                                lx.data.Extraction(
                                    extraction_class="item",
                                    extraction_text="The company faces regulatory risks",
                                    attributes={"category": "Legal"},
                                )
                            ],
                        )
                    ]

                    status.write("π€ AI is extracting data...")
                    result = lx.extract(
                        text_or_documents=text,
                        prompt_description=prompt,
                        examples=examples,
                        model_id=model_choice,
                    )

                    # `or []` guards against a result that carries no
                    # extractions list — presumably possible; verify against
                    # the langextract version in use.
                    extraction_count = len(result.extractions or [])
                    st.session_state.extraction_count += extraction_count
                    status.write(f"β Found {extraction_count} items")

                    status.write("π¨ Generating evidence visualization...")
                    # Persist the annotated result; the viewer further down
                    # renders from this file on the next run. The visualize
                    # call that used to follow here produced a value that was
                    # never used before the rerun, so it was dropped.
                    lx.io.save_annotated_documents([result], output_name="data.jsonl", output_dir=".")

                    status.update(label="β Audit Complete!", state="complete")
                    add_message("assistant", f"Found **{extraction_count}** relevant items for '{topic}'. Evidence is highlighted below with source verification.")
                    st.rerun()
                except Exception as e:
                    status.update(label="β Error", state="error")
                    add_message("assistant", f"β οΈ Extraction failed: {str(e)}")
                    st.error(f"Extraction Failed: {e}")

    # Demo result card, shown once a demo audit has been requested.
    if not all_deps_ready and st.session_state.get("demo_result_visible"):
        st.markdown("### π Demo Results")
        st.markdown(
            render_result_card(
                "Sample Extraction",
                "This is a demonstration of the DocuTrace extraction interface. In production mode with proper dependencies installed, real AI-powered extractions with source grounding would appear here.",
                "π",
            ),
            unsafe_allow_html=True,
        )

    # Results Display (after extraction)
    if os.path.exists("data.jsonl") and LANGEXTRACT_AVAILABLE:
        st.markdown("---")
        st.markdown(
            """
            <div style="display: flex; align-items: center; gap: 12px; margin-bottom: 1rem;">
                <div style="font-size: 1.5rem;">π</div>
                <div>
                    <div style="font-family: 'Outfit', sans-serif; font-size: 1.25rem; font-weight: 600; color: #f1f5f9;">
                        Verified Evidence
                    </div>
                    <div style="color: #64748b; font-size: 0.85rem;">
                        Click highlights to see source text
                    </div>
                </div>
            </div>
            """,
            unsafe_allow_html=True,
        )
        try:
            html_obj = lx.visualize("data.jsonl")
            # visualize() may hand back either an object exposing `.data` or
            # a plain HTML string; accept both.
            evidence_html = getattr(html_obj, "data", html_obj)
            components.html(evidence_html, height=600, scrolling=True)
        except Exception as e:
            # Report the failure instead of silently leaving an empty section.
            st.warning(f"Could not render evidence visualization: {e}")
else:
    # Empty State
    st.markdown(
        """
        <div style="text-align: center; padding: 4rem 2rem;">
            <div style="font-size: 4rem; margin-bottom: 1.5rem; opacity: 0.5;">π</div>
            <div style="font-family: 'Outfit', sans-serif; font-size: 1.5rem; font-weight: 600; color: #94a3b8; margin-bottom: 0.5rem;">
                No Document Loaded
            </div>
            <div style="color: #64748b; max-width: 400px; margin: 0 auto;">
                Upload a PDF from the sidebar to begin extracting structured data with verifiable source grounding.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Feature Cards
    st.markdown("<br>", unsafe_allow_html=True)
    # (icon, title, description) for each card; the markup is shared.
    feature_cards = (
        ("π―", "Precision Extraction", "Extract exactly what you need with AI-powered document understanding"),
        ("β", "Source Grounding", "Every extraction is linked to its exact source location in the document"),
        ("π", "Enterprise Ready", "Built for high-stakes domains: Legal, Finance, and Compliance"),
    )
    for card_column, (card_icon, card_title, card_text) in zip(st.columns(3), feature_cards):
        with card_column:
            st.markdown(
                f"""
                <div class="glass-card" style="text-align: center;">
                    <div style="font-size: 2.5rem; margin-bottom: 1rem;">{card_icon}</div>
                    <div style="font-family: 'Outfit', sans-serif; font-size: 1.1rem; font-weight: 600; color: #f1f5f9; margin-bottom: 0.5rem;">
                        {card_title}
                    </div>
                    <div style="color: #64748b; font-size: 0.9rem;">
                        {card_text}
                    </div>
                </div>
                """,
                unsafe_allow_html=True,
            )