Spaces:
Sleeping
Sleeping
| """ | |
| Strike Capital AI Diligence Automation - Gradio Application | |
| V0 Document Generation from Harmonic PDFs. | |
| """ | |
| import os | |
| import re | |
| import tempfile | |
| import gradio as gr | |
| import markdown2 | |
| from fpdf import FPDF | |
| from config import validate_config | |
| from pdf_extractor import extract_from_pdf | |
| from pinecone_store import store_extracted_data, store_v0_document, store_sentence_chunks | |
| from v0_generator import generate_v0_document | |
# Check configuration once at import time. A bad configuration is reported on
# stdout but does not stop the UI from being built.
try:
    validate_config()
except ValueError as config_error:
    print(f"[ERROR] Configuration error: {config_error}")
else:
    print("[OK] Configuration validated successfully")
# Custom CSS for Strike Capital branding.
# This stylesheet is embedded verbatim inside a <style> tag both by the
# in-app HTML preview (process_pdf) and by the standalone HTML export
# (export_to_html). The selectors are unscoped element selectors (body, h1,
# table, ...), so they apply to the whole page they are injected into.
STRIKE_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
body {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
    line-height: 1.6;
    color: #1a1a1a;
    max-width: 900px;
    margin: 0 auto;
    padding: 40px;
    background: #ffffff;
}
h1 {
    color: #0a0a0a;
    font-weight: 700;
    font-size: 28px;
    border-bottom: 3px solid #2563eb;
    padding-bottom: 12px;
    margin-bottom: 24px;
}
h2 {
    color: #0a0a0a;
    font-weight: 600;
    font-size: 20px;
    margin-top: 32px;
    margin-bottom: 16px;
    padding-bottom: 8px;
    border-bottom: 1px solid #e5e5e5;
}
h3 {
    color: #262626;
    font-weight: 600;
    font-size: 16px;
    margin-top: 24px;
}
p {
    margin-bottom: 16px;
}
ul, ol {
    margin-bottom: 16px;
    padding-left: 24px;
}
li {
    margin-bottom: 8px;
}
/* Table wrapper for horizontal scroll */
.table-wrapper {
    overflow-x: auto;
    margin: 16px 0;
}
table {
    width: 100%;
    min-width: 600px;
    border-collapse: collapse;
    font-size: 13px;
    display: block;
    overflow-x: auto;
}
th {
    background: #f5f5f5;
    font-weight: 600;
    text-align: left;
    padding: 10px 8px;
    border: 1px solid #e5e5e5;
    white-space: nowrap;
}
td {
    padding: 8px;
    border: 1px solid #e5e5e5;
    min-width: 80px;
}
tr:nth-child(even) {
    background: #fafafa;
}
strong {
    font-weight: 600;
}
em {
    color: #525252;
}
hr {
    border: none;
    border-top: 1px solid #e5e5e5;
    margin: 24px 0;
}
code {
    background: #f5f5f5;
    padding: 2px 6px;
    border-radius: 4px;
    font-size: 14px;
}
blockquote {
    border-left: 4px solid #2563eb;
    padding-left: 16px;
    margin: 16px 0;
    color: #525252;
}
.header-meta {
    color: #737373;
    font-size: 14px;
    margin-bottom: 24px;
}
@media print {
    body {
        padding: 20px;
    }
    h2 {
        page-break-after: avoid;
    }
}
"""
def process_pdf(pdf_file, progress=gr.Progress()):
    """
    Main processing pipeline: PDF → Extract → Generate V0 → Store.

    Args:
        pdf_file: Uploaded PDF file. Either a filesystem path string (the
            upload component is declared with ``type="filepath"``) or an
            object exposing the path as ``.name``.
        progress: Gradio progress tracker

    Returns:
        Tuple of (markdown_content, html_preview, status_message). When no
        file is given or any step fails, the markdown is empty and the other
        two elements carry the explanation.
    """
    # NOTE(review): the emoji in the user-facing strings below were
    # mis-encoded in the reviewed copy and have been reconstructed — confirm
    # against the intended UI copy.
    from html import escape  # function-scope import; only the error path needs it

    if pdf_file is None:
        return "", "<p>Please upload a Harmonic PDF report.</p>", "⚠️ No file uploaded"

    # Accept both a plain path string and a file-like wrapper with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    def fraction(done, total):
        # A callback reporting total == 0 must not abort the whole run.
        return done / total if total else 1.0

    try:
        # Step 1: Extract data from PDF (returns extracted_data AND sentence_chunks)
        progress(0.1, desc="📄 Extracting text and chunking sentences...")

        def extraction_progress(batch, total):
            progress(
                0.1 + 0.3 * fraction(batch, total),
                desc=f"📊 Processing batch {batch}/{total}...",
            )

        extracted_data, sentence_chunks = extract_from_pdf(
            pdf_path, progress_callback=extraction_progress
        )

        # Get company name; fall back when the section or the name is
        # missing, None or empty.
        company_info = extracted_data.get("company_info") or {}
        company_name = company_info.get("name") or "Unknown Company"
        progress(0.4, desc=f"✅ Extracted data for {company_name}")

        # Step 2: Store sentence chunks in Pinecone (one vector per sentence)
        progress(0.45, desc=f"💾 Storing {len(sentence_chunks)} sentence chunks in Pinecone...")
        chunks_result = store_sentence_chunks(company_name, sentence_chunks)

        # Step 3: Store extracted sections in Pinecone
        progress(0.55, desc="💾 Storing extracted sections...")
        store_result = store_extracted_data(company_name, extracted_data)

        # Step 4: Generate V0 document
        def generation_progress(step, total, section_name):
            progress(
                0.6 + 0.35 * fraction(step, total),
                desc=f"✍️ Generating: {section_name}...",
            )

        v0_document = generate_v0_document(
            extracted_data,
            company_name,
            progress_callback=generation_progress,
        )

        # Step 5: Store V0 document
        progress(0.95, desc="💾 Saving V0 document...")
        store_v0_document(company_name, v0_document)

        # Convert to HTML for preview
        html_content = markdown2.markdown(
            v0_document,
            extras=["tables", "fenced-code-blocks", "header-ids"],
        )
        html_preview = f"<style>{STRIKE_CSS}</style>{html_content}"

        progress(1.0, desc="✅ Complete!")
        status = f"✅ Successfully generated V0 for **{company_name}**\n\n"
        status += f"- Sentence chunks stored: {chunks_result.get('chunks_stored', 'N/A')}\n"
        status += f"- Sections stored: {store_result.get('sections_stored', 'N/A')}\n"
        status += "- Document version: v0"
        return v0_document, html_preview, status
    except Exception as e:
        # Top-level UI boundary: report the failure to the user instead of
        # crashing the handler, and leave a trace in the server log.
        error_msg = f"❌ Error: {str(e)}"
        print(f"[ERROR] V0 generation failed: {e!r}")
        # Exception text can contain markup characters; escape it before
        # embedding it in the HTML preview.
        return "", f"<p style='color: red;'>{escape(error_msg)}</p>", error_msg
def export_to_html(markdown_content, auto_print=False):
    """Render markdown into a standalone, Strike-styled HTML file.

    Args:
        markdown_content: Markdown source; a falsy value produces no file.
        auto_print: When true, embed a script that opens the browser print
            dialog 500 ms after the page has loaded.

    Returns:
        Path of a temporary ``.html`` file (created with ``delete=False``),
        or None when there is nothing to export.
    """
    if not markdown_content:
        return None

    body_html = markdown2.markdown(
        markdown_content,
        extras=["tables", "fenced-code-blocks", "header-ids"],
    )

    # Auto-print script (opens print dialog when page loads)
    if auto_print:
        print_script = "\n".join([
            "",
            "<script>",
            "window.onload = function() {",
            "// Small delay to ensure styles are loaded",
            "setTimeout(function() {",
            "window.print();",
            "}, 500);",
            "}",
            "</script>",
            "",
        ])
    else:
        print_script = ""

    page = "\n".join([
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="UTF-8">',
        "<title>Strike Capital - V0 Diligence Document</title>",
        f"<style>{STRIKE_CSS}</style>",
        print_script,
        "</head>",
        "<body>",
        body_html,
        "</body>",
        "</html>",
    ])

    # Save to temp file
    out_file = tempfile.NamedTemporaryFile(
        mode='w', suffix='.html', delete=False, encoding='utf-8'
    )
    with out_file:
        out_file.write(page)
    return out_file.name
def export_to_pdf(markdown_content):
    """Export markdown to a real PDF file using fpdf2.

    The markdown is rendered line by line with a small set of rules:
    ``#``/``##``/``###`` headings, ``-``/``*`` bullets, numbered items,
    fully bold lines, horizontal rules and plain paragraphs. Other
    constructs (for example tables) are written out as plain text.

    Args:
        markdown_content: Markdown source; a falsy value produces no file.

    Returns:
        Path of a temporary ``.pdf`` file (created with ``delete=False``),
        or None when there is nothing to export.
    """
    if not markdown_content:
        return None

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Use built-in fonts (Helvetica). They are not Unicode fonts, so every
    # piece of text goes through clean_text_for_pdf first.

    left_edge = pdf.l_margin
    right_edge = pdf.w - pdf.r_margin

    def write_block(text, line_height):
        pdf.multi_cell(0, line_height, clean_text_for_pdf(text))
        # multi_cell may leave the cursor at the right margin; without this
        # reset a directly following multi_cell(0, ...) has no width left.
        pdf.set_x(left_edge)

    def draw_rule(rgb, thickness):
        pdf.set_draw_color(*rgb)
        pdf.set_line_width(thickness)
        pdf.line(left_edge, pdf.get_y(), right_edge, pdf.get_y())

    def set_style(weight, size, rgb):
        pdf.set_font('Helvetica', weight, size)
        pdf.set_text_color(*rgb)

    for raw_line in markdown_content.split('\n'):
        line = raw_line.strip()
        if not line:
            pdf.ln(4)
            continue

        if line.startswith('# '):
            set_style('B', 20, (10, 10, 10))
            write_block(line[2:].strip(), 10)
            pdf.ln(2)
            # Accent underline beneath the document title
            draw_rule((37, 99, 235), 0.8)
            pdf.ln(6)
        elif line.startswith('## '):
            set_style('B', 16, (10, 10, 10))
            pdf.ln(6)
            write_block(line[3:].strip(), 8)
            pdf.ln(2)
        elif line.startswith('### '):
            set_style('B', 13, (38, 38, 38))
            pdf.ln(4)
            write_block(line[4:].strip(), 7)
            pdf.ln(2)
        elif re.fullmatch(r'-{3,}|\*{3,}|_{3,}', line):
            # Horizontal rule. Checked before the bold branch because a line
            # of only asterisks also starts and ends with '**'; matched as a
            # whole line so that text after the marker is never dropped.
            pdf.ln(4)
            draw_rule((229, 229, 229), 0.3)
            pdf.ln(4)
        elif line.startswith('- ') or line.startswith('* '):
            set_style('', 11, (26, 26, 26))
            pdf.cell(8, 6, chr(149))  # bullet character
            write_block(line[2:].strip(), 6)
        elif re.match(r'^\d+\.\s', line):
            # Numbered item: keep the number as part of the text
            set_style('', 11, (26, 26, 26))
            write_block(line, 6)
        elif line.startswith('**') and line.endswith('**'):
            set_style('B', 11, (26, 26, 26))
            write_block(line.strip('*'), 6)
        else:
            # Regular paragraph (inline markers are stripped by the cleaner)
            set_style('', 11, (26, 26, 26))
            write_block(line, 6)

    # Reserve a temp path, close the handle, then let fpdf write to it:
    # writing by name while the handle is still open fails on platforms that
    # lock open files.
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.pdf', delete=False) as tmp:
        pdf_path = tmp.name
    pdf.output(pdf_path)
    return pdf_path
def clean_text_for_pdf(text):
    """Clean text for PDF output - remove markdown formatting and handle special chars.

    Steps, in order:
      1. Strip ``**bold**``, ``*italic*`` and `` `code` `` markers, keeping
         the inner text.
      2. Replace known symbols with ASCII stand-ins (arrow, warning,
         check/cross marks, curly quotes, dashes, ellipsis).
      3. Remove emoji pictographs and emoji joiner/variation characters.
      4. Replace anything still outside latin-1 with ``?``.

    Args:
        text: A single line or fragment of markdown text.

    Returns:
        A string containing only latin-1 characters.
    """
    # Remove bold markers, then italic markers, then inline code markers
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'`(.+?)`', r'\1', text)

    # NOTE(review): the replacement keys were mis-encoded in the reviewed
    # copy; the code points below are a reconstruction from the replacement
    # values — confirm. They are spelled as escapes so the table survives any
    # re-encoding of this source file.
    symbol_map = str.maketrans({
        '\u2192': '->',    # rightwards arrow
        '\u26a0': '[!]',   # warning sign
        '\u2713': '[OK]',  # check mark
        '\u2714': '[OK]',  # heavy check mark
        '\u2705': '[OK]',  # white heavy check mark
        '\u274c': '[X]',   # cross mark
        '\u2717': '[X]',   # ballot x
        '\u2018': "'",     # left single quote
        '\u2019': "'",     # right single quote
        '\u201c': '"',     # left double quote
        '\u201d': '"',     # right double quote
        '\u2013': '-',     # en dash
        '\u2014': '-',     # em dash
        '\u2026': '...',   # ellipsis
        '\u270d': '',      # writing hand
        '\ufe0f': '',      # emoji variation selector (otherwise becomes '?')
        '\u200d': '',      # zero-width joiner used inside emoji sequences
    })
    text = text.translate(symbol_map)

    # Drop pictographic emoji as a class instead of listing individual ones.
    text = re.sub('[\U0001F000-\U0001FAFF]', '', text)

    # Encode to latin-1 (what FPDF uses by default), replacing unknown chars
    text = text.encode('latin-1', errors='replace').decode('latin-1')
    return text
# Build Gradio Interface
# Layout: a narrow left column (upload, generate button, status, export
# buttons and their initially hidden download slots) next to a wide right
# column with "Preview" (rendered HTML) and "Markdown" (raw text) tabs.
# NOTE(review): the emoji in the labels below were mis-encoded in the reviewed
# copy and have been reconstructed — confirm against the intended UI copy.
with gr.Blocks(
    title="Strike Capital - AI Diligence",
    theme=gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="slate",
    ),
    css="""
    .container { max-width: 1400px; margin: auto; }
    .header { text-align: center; margin-bottom: 2rem; }
    .status-box { padding: 1rem; border-radius: 8px; background: #f8fafc; }
    /* Make tables horizontally scrollable */
    table { display: block; overflow-x: auto; white-space: nowrap; max-width: 100%; }
    """
) as app:
    gr.Markdown("""
    # 🎯 Strike Capital - AI Diligence Automation
    Upload a Harmonic PDF report to automatically generate a V0 Diligence Document.
    """)

    with gr.Row():
        with gr.Column(scale=1, min_width=280):
            gr.Markdown("### 📤 Upload Harmonic Report")
            pdf_input = gr.File(
                label="Harmonic PDF Report",
                file_types=[".pdf"],
                type="filepath"
            )
            generate_btn = gr.Button(
                "🚀 Generate V0 Document",
                variant="primary",
                size="lg"
            )
            gr.Markdown("### 📊 Status")
            status_output = gr.Markdown("*Ready to process...*")
            gr.Markdown("### 📥 Export")
            with gr.Row():
                export_html_btn = gr.Button("📄 HTML", size="sm", variant="secondary")
                export_pdf_btn = gr.Button("📑 Save as PDF", size="sm", variant="primary")
            # Download slots stay hidden until an export produces a file.
            html_download = gr.File(label="HTML Download", visible=False)
            pdf_download = gr.File(label="PDF Download", visible=False)

        with gr.Column(scale=4):
            gr.Markdown("### 📋 Document Preview")
            with gr.Tabs():
                with gr.TabItem("Preview"):
                    html_preview = gr.HTML(
                        value="<p style='color: #666; text-align: center; padding: 40px;'>Upload a PDF to generate your V0 document...</p>"
                    )
                with gr.TabItem("Markdown"):
                    markdown_output = gr.Textbox(
                        label="Raw Markdown",
                        lines=30,
                        max_lines=50,
                        show_copy_button=True
                    )

    def _show_if_present(file_value):
        # Reveal a download slot only when the export actually returned a file.
        return gr.update(visible=bool(file_value))

    # Event handlers
    generate_btn.click(
        fn=process_pdf,
        inputs=[pdf_input],
        outputs=[markdown_output, html_preview, status_output],
        show_progress=True
    )
    # Both exports read the raw markdown from the Markdown tab, so they work
    # on whatever text is currently in that box.
    export_html_btn.click(
        fn=export_to_html,
        inputs=[markdown_output],
        outputs=[html_download]
    ).then(
        fn=_show_if_present,
        inputs=[html_download],
        outputs=[html_download]
    )
    export_pdf_btn.click(
        fn=export_to_pdf,
        inputs=[markdown_output],
        outputs=[pdf_download]
    ).then(
        fn=_show_if_present,
        inputs=[pdf_download],
        outputs=[pdf_download]
    )

    gr.Markdown("""
    ---
    *Strike Capital AI Diligence System v0.1 | Powered by GPT-5.1 + Pinecone*
    """)
# For Hugging Face Spaces Docker deployment: listen on every interface on
# port 7860. The server is started as soon as this module is executed.
app.launch(server_name="0.0.0.0", server_port=7860)