""" Strike Capital AI Diligence Automation - Gradio Application V0 Document Generation from Harmonic PDFs. """ import os import re import tempfile import gradio as gr import markdown2 from fpdf import FPDF from config import validate_config from pdf_extractor import extract_from_pdf from pinecone_store import store_extracted_data, store_v0_document, store_sentence_chunks from v0_generator import generate_v0_document # Validate configuration on startup try: validate_config() print("[OK] Configuration validated successfully") except ValueError as e: print(f"[ERROR] Configuration error: {e}") # Custom CSS for Strike Capital branding STRIKE_CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap'); body { font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; line-height: 1.6; color: #1a1a1a; max-width: 900px; margin: 0 auto; padding: 40px; background: #ffffff; } h1 { color: #0a0a0a; font-weight: 700; font-size: 28px; border-bottom: 3px solid #2563eb; padding-bottom: 12px; margin-bottom: 24px; } h2 { color: #0a0a0a; font-weight: 600; font-size: 20px; margin-top: 32px; margin-bottom: 16px; padding-bottom: 8px; border-bottom: 1px solid #e5e5e5; } h3 { color: #262626; font-weight: 600; font-size: 16px; margin-top: 24px; } p { margin-bottom: 16px; } ul, ol { margin-bottom: 16px; padding-left: 24px; } li { margin-bottom: 8px; } /* Table wrapper for horizontal scroll */ .table-wrapper { overflow-x: auto; margin: 16px 0; } table { width: 100%; min-width: 600px; border-collapse: collapse; font-size: 13px; display: block; overflow-x: auto; } th { background: #f5f5f5; font-weight: 600; text-align: left; padding: 10px 8px; border: 1px solid #e5e5e5; white-space: nowrap; } td { padding: 8px; border: 1px solid #e5e5e5; min-width: 80px; } tr:nth-child(even) { background: #fafafa; } strong { font-weight: 600; } em { color: #525252; } hr { border: none; border-top: 1px solid #e5e5e5; margin: 24px 0; } code { background: #f5f5f5; padding: 2px 6px; border-radius: 4px; font-size: 14px; } blockquote { border-left: 4px solid #2563eb; padding-left: 16px; margin: 16px 0; color: #525252; } .header-meta { color: #737373; font-size: 14px; margin-bottom: 24px; } @media print { body { padding: 20px; } h2 { page-break-after: avoid; } } """ def process_pdf(pdf_file, progress=gr.Progress()): """ Main processing pipeline: PDF → Extract → Generate V0 → Store. Args: pdf_file: Uploaded PDF file progress: Gradio progress tracker Returns: Tuple of (markdown_content, html_preview, status_message) """ if pdf_file is None: return "", "

Please upload a Harmonic PDF report.

", "⚠️ No file uploaded" try: # Step 1: Extract data from PDF (returns extracted_data AND sentence_chunks) progress(0.1, desc="📄 Extracting text and chunking sentences...") def extraction_progress(batch, total): progress(0.1 + (0.3 * batch / total), desc=f"🔍 Processing batch {batch}/{total}...") extracted_data, sentence_chunks = extract_from_pdf(pdf_file.name, progress_callback=extraction_progress) # Get company name company_name = extracted_data.get("company_info", {}).get("name", "Unknown Company") progress(0.4, desc=f"✓ Extracted data for {company_name}") # Step 2: Store sentence chunks in Pinecone (one vector per sentence) progress(0.45, desc=f"💾 Storing {len(sentence_chunks)} sentence chunks in Pinecone...") chunks_result = store_sentence_chunks(company_name, sentence_chunks) # Step 3: Store extracted sections in Pinecone progress(0.55, desc="💾 Storing extracted sections...") store_result = store_extracted_data(company_name, extracted_data) # Step 4: Generate V0 document def generation_progress(step, total, section_name): progress(0.6 + (0.35 * step / total), desc=f"✍️ Generating: {section_name}...") v0_document = generate_v0_document( extracted_data, company_name, progress_callback=generation_progress ) # Step 5: Store V0 document progress(0.95, desc="💾 Saving V0 document...") store_v0_document(company_name, v0_document) # Convert to HTML for preview html_content = markdown2.markdown( v0_document, extras=["tables", "fenced-code-blocks", "header-ids"] ) html_preview = f"{html_content}" progress(1.0, desc="✓ Complete!") status = f"✅ Successfully generated V0 for **{company_name}**\n\n" status += f"- Sentence chunks stored: {chunks_result.get('chunks_stored', 'N/A')}\n" status += f"- Sections stored: {store_result.get('sections_stored', 'N/A')}\n" status += f"- Document version: v0" return v0_document, html_preview, status except Exception as e: error_msg = f"❌ Error: {str(e)}" return "", f"

{error_msg}

", error_msg def export_to_html(markdown_content, auto_print=False): """Export markdown to styled HTML file with optional auto-print dialog.""" if not markdown_content: return None html_content = markdown2.markdown( markdown_content, extras=["tables", "fenced-code-blocks", "header-ids"] ) # Auto-print script (opens print dialog when page loads) print_script = """ """ if auto_print else "" full_html = f""" Strike Capital - V0 Diligence Document {print_script} {html_content} """ # Save to temp file with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f: f.write(full_html) return f.name def export_to_pdf(markdown_content): """Export markdown to a real PDF file using fpdf2.""" if not markdown_content: return None # Create PDF with Unicode support pdf = FPDF() pdf.set_auto_page_break(auto=True, margin=15) pdf.add_page() # Use built-in fonts (Helvetica for body, Courier for code) # Note: For full Unicode support, you'd need to add a Unicode font # Process markdown content line by line lines = markdown_content.split('\n') for line in lines: line = line.strip() if not line: pdf.ln(4) continue # Handle headers if line.startswith('# '): pdf.set_font('Helvetica', 'B', 20) pdf.set_text_color(10, 10, 10) text = line[2:].strip() pdf.multi_cell(0, 10, clean_text_for_pdf(text)) pdf.ln(2) # Add underline pdf.set_draw_color(37, 99, 235) pdf.set_line_width(0.8) pdf.line(10, pdf.get_y(), 200, pdf.get_y()) pdf.ln(6) elif line.startswith('## '): pdf.set_font('Helvetica', 'B', 16) pdf.set_text_color(10, 10, 10) text = line[3:].strip() pdf.ln(6) pdf.multi_cell(0, 8, clean_text_for_pdf(text)) pdf.ln(2) elif line.startswith('### '): pdf.set_font('Helvetica', 'B', 13) pdf.set_text_color(38, 38, 38) text = line[4:].strip() pdf.ln(4) pdf.multi_cell(0, 7, clean_text_for_pdf(text)) pdf.ln(2) elif line.startswith('- ') or line.startswith('* '): pdf.set_font('Helvetica', '', 11) pdf.set_text_color(26, 26, 26) text = line[2:].strip() # Bullet point pdf.cell(8, 6, chr(149)) # bullet character pdf.multi_cell(0, 6, clean_text_for_pdf(text)) elif re.match(r'^\d+\.\s', line): pdf.set_font('Helvetica', '', 11) pdf.set_text_color(26, 26, 26) pdf.multi_cell(0, 6, clean_text_for_pdf(line)) elif line.startswith('**') and line.endswith('**'): pdf.set_font('Helvetica', 'B', 11) pdf.set_text_color(26, 26, 26) text = line.strip('*') pdf.multi_cell(0, 6, clean_text_for_pdf(text)) elif line.startswith('---') or line.startswith('***'): pdf.ln(4) pdf.set_draw_color(229, 229, 229) pdf.set_line_width(0.3) pdf.line(10, pdf.get_y(), 200, pdf.get_y()) pdf.ln(4) else: # Regular paragraph pdf.set_font('Helvetica', '', 11) pdf.set_text_color(26, 26, 26) # Handle inline bold (**text**) text = clean_text_for_pdf(line) pdf.multi_cell(0, 6, text) # Save to temp file with tempfile.NamedTemporaryFile(mode='wb', suffix='.pdf', delete=False) as f: pdf.output(f.name) return f.name def clean_text_for_pdf(text): """Clean text for PDF output - remove markdown formatting and handle special chars.""" # Remove bold markers text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) # Remove italic markers text = re.sub(r'\*(.+?)\*', r'\1', text) # Remove inline code markers text = re.sub(r'`(.+?)`', r'\1', text) # Replace special Unicode chars that might cause issues text = text.replace('→', '->') text = text.replace('⚠', '[!]') text = text.replace('✓', '[OK]') text = text.replace('✅', '[OK]') text = text.replace('❌', '[X]') text = text.replace('📄', '') text = text.replace('📊', '') text = text.replace('📋', '') text = text.replace('💾', '') text = text.replace('🔍', '') text = text.replace('✍️', '') text = text.replace('🚀', '') text = text.replace('📤', '') text = text.replace('📥', '') text = text.replace('🎯', '') # Encode to latin-1 (what FPDF uses by default), replacing unknown chars text = text.encode('latin-1', errors='replace').decode('latin-1') return text # Build Gradio Interface with gr.Blocks( title="Strike Capital - AI Diligence", theme=gr.themes.Soft( primary_hue="blue", neutral_hue="slate", ), css=""" .container { max-width: 1400px; margin: auto; } .header { text-align: center; margin-bottom: 2rem; } .status-box { padding: 1rem; border-radius: 8px; background: #f8fafc; } /* Make tables horizontally scrollable */ table { display: block; overflow-x: auto; white-space: nowrap; max-width: 100%; } """ ) as app: gr.Markdown(""" # 🎯 Strike Capital - AI Diligence Automation Upload a Harmonic PDF report to automatically generate a V0 Diligence Document. """) with gr.Row(): with gr.Column(scale=1, min_width=280): gr.Markdown("### 📤 Upload Harmonic Report") pdf_input = gr.File( label="Harmonic PDF Report", file_types=[".pdf"], type="filepath" ) generate_btn = gr.Button( "🚀 Generate V0 Document", variant="primary", size="lg" ) gr.Markdown("### 📊 Status") status_output = gr.Markdown("*Ready to process...*") gr.Markdown("### 📥 Export") with gr.Row(): export_html_btn = gr.Button("📄 HTML", size="sm", variant="secondary") export_pdf_btn = gr.Button("📑 Save as PDF", size="sm", variant="primary") html_download = gr.File(label="HTML Download", visible=False) pdf_download = gr.File(label="PDF Download", visible=False) with gr.Column(scale=4): gr.Markdown("### 📋 Document Preview") with gr.Tabs(): with gr.TabItem("Preview"): html_preview = gr.HTML( value="

Upload a PDF to generate your V0 document...

" ) with gr.TabItem("Markdown"): markdown_output = gr.Textbox( label="Raw Markdown", lines=30, max_lines=50, show_copy_button=True ) # Event handlers generate_btn.click( fn=process_pdf, inputs=[pdf_input], outputs=[markdown_output, html_preview, status_output], show_progress=True ) export_html_btn.click( fn=export_to_html, inputs=[markdown_output], outputs=[html_download] ).then( fn=lambda x: gr.update(visible=True) if x else gr.update(visible=False), inputs=[html_download], outputs=[html_download] ) export_pdf_btn.click( fn=export_to_pdf, inputs=[markdown_output], outputs=[pdf_download] ).then( fn=lambda x: gr.update(visible=True) if x else gr.update(visible=False), inputs=[pdf_download], outputs=[pdf_download] ) gr.Markdown(""" --- *Strike Capital AI Diligence System v0.1 | Powered by GPT-5.1 + Pinecone* """) # For Hugging Face Spaces Docker deployment app.launch( server_name="0.0.0.0", server_port=7860 )