# Origin: app.py, uploaded by EnesDS (revision 01d7150, verified).
"""
Strike Capital AI Diligence Automation - Gradio Application
V0 Document Generation from Harmonic PDFs.
"""
import os
import re
import tempfile
import gradio as gr
import markdown2
from fpdf import FPDF
from config import validate_config
from pdf_extractor import extract_from_pdf
from pinecone_store import store_extracted_data, store_v0_document, store_sentence_chunks
from v0_generator import generate_v0_document
# Startup check: report whether the configuration is usable. A ValueError is
# reported on stdout and the module keeps loading.
try:
    validate_config()
except ValueError as config_error:
    print(f"[ERROR] Configuration error: {config_error}")
else:
    print("[OK] Configuration validated successfully")
# Custom CSS for Strike Capital branding.
# This stylesheet is embedded verbatim inside a <style> tag in two places:
# the in-app HTML preview built by process_pdf and the standalone page written
# by export_to_html. It styles headings, lists, tables (horizontally
# scrollable), inline code and blockquotes, and adds a small print section.
STRIKE_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
body {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
line-height: 1.6;
color: #1a1a1a;
max-width: 900px;
margin: 0 auto;
padding: 40px;
background: #ffffff;
}
h1 {
color: #0a0a0a;
font-weight: 700;
font-size: 28px;
border-bottom: 3px solid #2563eb;
padding-bottom: 12px;
margin-bottom: 24px;
}
h2 {
color: #0a0a0a;
font-weight: 600;
font-size: 20px;
margin-top: 32px;
margin-bottom: 16px;
padding-bottom: 8px;
border-bottom: 1px solid #e5e5e5;
}
h3 {
color: #262626;
font-weight: 600;
font-size: 16px;
margin-top: 24px;
}
p {
margin-bottom: 16px;
}
ul, ol {
margin-bottom: 16px;
padding-left: 24px;
}
li {
margin-bottom: 8px;
}
/* Table wrapper for horizontal scroll */
.table-wrapper {
overflow-x: auto;
margin: 16px 0;
}
table {
width: 100%;
min-width: 600px;
border-collapse: collapse;
font-size: 13px;
display: block;
overflow-x: auto;
}
th {
background: #f5f5f5;
font-weight: 600;
text-align: left;
padding: 10px 8px;
border: 1px solid #e5e5e5;
white-space: nowrap;
}
td {
padding: 8px;
border: 1px solid #e5e5e5;
min-width: 80px;
}
tr:nth-child(even) {
background: #fafafa;
}
strong {
font-weight: 600;
}
em {
color: #525252;
}
hr {
border: none;
border-top: 1px solid #e5e5e5;
margin: 24px 0;
}
code {
background: #f5f5f5;
padding: 2px 6px;
border-radius: 4px;
font-size: 14px;
}
blockquote {
border-left: 4px solid #2563eb;
padding-left: 16px;
margin: 16px 0;
color: #525252;
}
.header-meta {
color: #737373;
font-size: 14px;
margin-bottom: 24px;
}
@media print {
body {
padding: 20px;
}
h2 {
page-break-after: avoid;
}
}
"""
def process_pdf(pdf_file, progress=gr.Progress()):
    """
    Run the full pipeline: PDF -> Extract -> Generate V0 -> Store.

    Args:
        pdf_file: Uploaded PDF, given either as a filesystem path string or
            as an object exposing that path as ``.name``. ``None`` means
            nothing was uploaded.
        progress: Gradio progress tracker; called with a 0..1 fraction and a
            ``desc`` label as each stage advances.

    Returns:
        Tuple of (markdown_content, html_preview, status_message). When no
        file is supplied or any stage raises, the markdown is empty and the
        other two elements carry the explanatory message.
    """
    import html  # local import: only used to escape error text for the preview

    if pdf_file is None:
        return "", "<p>Please upload a Harmonic PDF report.</p>", "⚠️ No file uploaded"

    # The upload component is declared with type="filepath", so the value may
    # be a plain path string; objects that carry the path in `.name` still work.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Step 1: Extract data from PDF (returns extracted_data AND sentence_chunks)
        progress(0.1, desc="πŸ“„ Extracting text and chunking sentences...")

        def extraction_progress(batch, total):
            # Guard the division: a zero total would abort the whole run.
            span = (0.3 * batch / total) if total else 0.3
            progress(0.1 + span, desc=f"πŸ” Processing batch {batch}/{total}...")

        extracted_data, sentence_chunks = extract_from_pdf(
            pdf_path, progress_callback=extraction_progress
        )

        # Get company name; fall back when the section or the name is
        # missing, None or empty so later labels never show "None".
        company_info = extracted_data.get("company_info") or {}
        company_name = company_info.get("name") or "Unknown Company"
        progress(0.4, desc=f"βœ“ Extracted data for {company_name}")

        # Step 2: Store sentence chunks in Pinecone (one vector per sentence)
        progress(0.45, desc=f"πŸ’Ύ Storing {len(sentence_chunks)} sentence chunks in Pinecone...")
        chunks_result = store_sentence_chunks(company_name, sentence_chunks)

        # Step 3: Store extracted sections in Pinecone
        progress(0.55, desc="πŸ’Ύ Storing extracted sections...")
        store_result = store_extracted_data(company_name, extracted_data)

        # Step 4: Generate V0 document
        def generation_progress(step, total, section_name):
            # Same zero-total guard as the extraction callback.
            span = (0.35 * step / total) if total else 0.35
            progress(0.6 + span, desc=f"✍️ Generating: {section_name}...")

        v0_document = generate_v0_document(
            extracted_data,
            company_name,
            progress_callback=generation_progress
        )

        # Step 5: Store V0 document
        progress(0.95, desc="πŸ’Ύ Saving V0 document...")
        store_v0_document(company_name, v0_document)

        # Convert to HTML for preview
        html_content = markdown2.markdown(
            v0_document,
            extras=["tables", "fenced-code-blocks", "header-ids"]
        )
        html_preview = f"<style>{STRIKE_CSS}</style>{html_content}"

        progress(1.0, desc="βœ“ Complete!")
        status = f"βœ… Successfully generated V0 for **{company_name}**\n\n"
        status += f"- Sentence chunks stored: {chunks_result.get('chunks_stored', 'N/A')}\n"
        status += f"- Sections stored: {store_result.get('sections_stored', 'N/A')}\n"
        status += "- Document version: v0"
        return v0_document, html_preview, status
    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing.
        error_msg = f"❌ Error: {str(e)}"
        # Exception text can contain markup characters; escape it before it
        # is embedded in the HTML preview. The status message stays raw.
        return "", f"<p style='color: red;'>{html.escape(error_msg)}</p>", error_msg
def export_to_html(markdown_content, auto_print=False):
    """Write the markdown as a standalone, styled HTML page and return its path.

    Args:
        markdown_content: Markdown text to render; a falsy value yields None.
        auto_print: When true, the page carries a script that opens the
            browser print dialog shortly after loading.

    Returns:
        Path of the temporary ``.html`` file (not deleted automatically), or
        None when there is nothing to export.
    """
    if not markdown_content:
        return None

    rendered_body = markdown2.markdown(
        markdown_content,
        extras=["tables", "fenced-code-blocks", "header-ids"],
    )

    if auto_print:
        # Auto-print script (opens print dialog when page loads)
        head_script = """
<script>
window.onload = function() {
// Small delay to ensure styles are loaded
setTimeout(function() {
window.print();
}, 500);
}
</script>
"""
    else:
        head_script = ""

    page = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Strike Capital - V0 Diligence Document</title>
<style>{STRIKE_CSS}</style>
{head_script}
</head>
<body>
{rendered_body}
</body>
</html>"""

    # Persist to a temp file that outlives this call so it can be downloaded.
    out_file = tempfile.NamedTemporaryFile(
        mode='w', suffix='.html', delete=False, encoding='utf-8'
    )
    with out_file:
        out_file.write(page)
    return out_file.name
def export_to_pdf(markdown_content):
    """Export markdown to a real PDF file using fpdf2.

    The markdown is rendered line by line with a small set of rules:
    ``#``/``##``/``###`` headings, ``-``/``*`` bullets, numbered items, lines
    that are entirely bold, horizontal rules, and plain paragraphs. Inline
    markers are stripped by ``clean_text_for_pdf``.

    Args:
        markdown_content: Markdown text to render; a falsy value yields None.

    Returns:
        Path of the temporary ``.pdf`` file (not deleted automatically), or
        None when there is nothing to export.
    """
    if not markdown_content:
        return None

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Only the built-in fonts are used (Helvetica), which are limited to
    # Latin-1 text; clean_text_for_pdf downgrades everything else. A Unicode
    # font would have to be registered for full Unicode output.
    for raw_line in markdown_content.split('\n'):
        line = raw_line.strip()
        if not line:
            pdf.ln(4)
            continue

        # Start every rendered line at the left margin. fpdf2's multi_cell()
        # defaults to new_x=RIGHT, so without this reset a following
        # full-width cell would begin with no horizontal room left.
        pdf.set_x(pdf.l_margin)

        # Handle headers
        if line.startswith('# '):
            pdf.set_font('Helvetica', 'B', 20)
            pdf.set_text_color(10, 10, 10)
            text = line[2:].strip()
            pdf.multi_cell(0, 10, clean_text_for_pdf(text))
            pdf.ln(2)
            # Add underline
            pdf.set_draw_color(37, 99, 235)
            pdf.set_line_width(0.8)
            pdf.line(10, pdf.get_y(), 200, pdf.get_y())
            pdf.ln(6)
        elif line.startswith('## '):
            pdf.set_font('Helvetica', 'B', 16)
            pdf.set_text_color(10, 10, 10)
            text = line[3:].strip()
            pdf.ln(6)
            pdf.multi_cell(0, 8, clean_text_for_pdf(text))
            pdf.ln(2)
        elif line.startswith('### '):
            pdf.set_font('Helvetica', 'B', 13)
            pdf.set_text_color(38, 38, 38)
            text = line[4:].strip()
            pdf.ln(4)
            pdf.multi_cell(0, 7, clean_text_for_pdf(text))
            pdf.ln(2)
        elif line.startswith('- ') or line.startswith('* '):
            pdf.set_font('Helvetica', '', 11)
            pdf.set_text_color(26, 26, 26)
            text = line[2:].strip()
            # Bullet point
            pdf.cell(8, 6, chr(149))  # bullet character
            pdf.multi_cell(0, 6, clean_text_for_pdf(text))
        elif re.match(r'^\d+\.\s', line):
            pdf.set_font('Helvetica', '', 11)
            pdf.set_text_color(26, 26, 26)
            pdf.multi_cell(0, 6, clean_text_for_pdf(line))
        elif line.startswith('**') and line.endswith('**') and line.strip('*'):
            # A fully bold line. Requiring some text between the asterisks
            # keeps a bare '***' from matching here, so it reaches the
            # horizontal-rule branch below.
            pdf.set_font('Helvetica', 'B', 11)
            pdf.set_text_color(26, 26, 26)
            text = line.strip('*')
            pdf.multi_cell(0, 6, clean_text_for_pdf(text))
        elif line.startswith('---') or line.startswith('***'):
            # Horizontal rule
            pdf.ln(4)
            pdf.set_draw_color(229, 229, 229)
            pdf.set_line_width(0.3)
            pdf.line(10, pdf.get_y(), 200, pdf.get_y())
            pdf.ln(4)
        else:
            # Regular paragraph; inline markers such as **bold** are removed
            # by clean_text_for_pdf, not rendered as styled text.
            pdf.set_font('Helvetica', '', 11)
            pdf.set_text_color(26, 26, 26)
            pdf.multi_cell(0, 6, clean_text_for_pdf(line))

    # Reserve a temp path, then close the handle before FPDF writes to it so
    # the file is never open twice at once.
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.pdf', delete=False) as f:
        pdf_path = f.name
    pdf.output(pdf_path)
    return pdf_path
# Symbol -> replacement pairs applied in order before Latin-1 encoding.
# NOTE(review): several keys look like UTF-8 arrows/emoji that were decoded
# with the wrong code page; they are kept verbatim here. Confirm against the
# intended characters.
_PDF_SYMBOL_REPLACEMENTS = (
    ('β†’', '->'),
    ('⚠', '[!]'),
    ('βœ“', '[OK]'),
    ('βœ…', '[OK]'),
    ('❌', '[X]'),
    ('πŸ“„', ''),
    ('πŸ“Š', ''),
    ('πŸ“‹', ''),
    ('πŸ’Ύ', ''),
    ('πŸ”', ''),
    ('✍️', ''),
    ('πŸš€', ''),
    ('πŸ“€', ''),
    ('πŸ“₯', ''),
    ('🎯', ''),
)

# Common non-Latin-1 characters mapped to ASCII stand-ins so they do not end
# up as '?' in the PDF. Invisible modifiers are dropped outright.
_PDF_ASCII_FALLBACKS = str.maketrans({
    '\u2018': "'",    # left single quote
    '\u2019': "'",    # right single quote / apostrophe
    '\u201c': '"',    # left double quote
    '\u201d': '"',    # right double quote
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2026': '...',  # ellipsis
    '\u2192': '->',   # rightwards arrow
    '\u2713': '[OK]',  # check mark
    '\u2705': '[OK]',  # check mark button
    '\ufe0f': '',     # emoji variation selector (left behind after a symbol)
    '\u200d': '',     # zero-width joiner
    '\u200b': '',     # zero-width space
})


def clean_text_for_pdf(text):
    """Clean text for PDF output - remove markdown formatting and handle special chars.

    Steps, in order: strip ``**bold**``, ``*italic*`` and `` `code` `` markers;
    apply the fixed symbol replacements; map typographic punctuation to ASCII
    and drop invisible modifiers; finally force the result into Latin-1,
    turning anything still unsupported into ``?``.

    Args:
        text: A single line or fragment of markdown text.

    Returns:
        A string containing only Latin-1 characters.
    """
    # Remove bold markers first so '**' is not half-consumed by the italic rule
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    # Remove italic markers
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    # Remove inline code markers
    text = re.sub(r'`(.+?)`', r'\1', text)

    # Fixed symbol replacements run before the ASCII fallbacks: some keys
    # contain quote characters that the fallback table would rewrite.
    for symbol, replacement in _PDF_SYMBOL_REPLACEMENTS:
        text = text.replace(symbol, replacement)

    text = text.translate(_PDF_ASCII_FALLBACKS)

    # Encode to latin-1 (what FPDF uses by default), replacing unknown chars
    return text.encode('latin-1', errors='replace').decode('latin-1')
# Build Gradio Interface
# Page-level CSS for the app shell (distinct from STRIKE_CSS, which styles the
# generated document).
_PAGE_CSS = """
.container { max-width: 1400px; margin: auto; }
.header { text-align: center; margin-bottom: 2rem; }
.status-box { padding: 1rem; border-radius: 8px; background: #f8fafc; }
/* Make tables horizontally scrollable */
table { display: block; overflow-x: auto; white-space: nowrap; max-width: 100%; }
"""


def _show_when_populated(file_value):
    """Make a download slot visible only when an export produced a file."""
    return gr.update(visible=bool(file_value))


with gr.Blocks(
    title="Strike Capital - AI Diligence",
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    css=_PAGE_CSS,
) as app:
    gr.Markdown("""
# 🎯 Strike Capital - AI Diligence Automation
Upload a Harmonic PDF report to automatically generate a V0 Diligence Document.
""")

    with gr.Row():
        # Left rail: upload, trigger, status and export controls.
        with gr.Column(scale=1, min_width=280):
            gr.Markdown("### πŸ“€ Upload Harmonic Report")
            pdf_input = gr.File(
                label="Harmonic PDF Report",
                file_types=[".pdf"],
                type="filepath",
            )
            generate_btn = gr.Button(
                "πŸš€ Generate V0 Document",
                variant="primary",
                size="lg",
            )

            gr.Markdown("### πŸ“Š Status")
            status_output = gr.Markdown("*Ready to process...*")

            gr.Markdown("### πŸ“₯ Export")
            with gr.Row():
                export_html_btn = gr.Button("πŸ“„ HTML", size="sm", variant="secondary")
                export_pdf_btn = gr.Button("πŸ“‘ Save as PDF", size="sm", variant="primary")
            # Hidden until the matching export has produced a file.
            html_download = gr.File(label="HTML Download", visible=False)
            pdf_download = gr.File(label="PDF Download", visible=False)

        # Main area: rendered preview and raw markdown side by side in tabs.
        with gr.Column(scale=4):
            gr.Markdown("### πŸ“‹ Document Preview")
            with gr.Tabs():
                with gr.TabItem("Preview"):
                    html_preview = gr.HTML(
                        value="<p style='color: #666; text-align: center; padding: 40px;'>Upload a PDF to generate your V0 document...</p>"
                    )
                with gr.TabItem("Markdown"):
                    markdown_output = gr.Textbox(
                        label="Raw Markdown",
                        lines=30,
                        max_lines=50,
                        show_copy_button=True,
                    )

    # Event handlers
    generate_btn.click(
        fn=process_pdf,
        inputs=[pdf_input],
        outputs=[markdown_output, html_preview, status_output],
        show_progress=True,
    )

    # Both exports follow the same two-step pattern: build the file from the
    # raw markdown, then reveal the download slot if a file came back.
    for trigger, exporter, slot in (
        (export_html_btn, export_to_html, html_download),
        (export_pdf_btn, export_to_pdf, pdf_download),
    ):
        trigger.click(
            fn=exporter,
            inputs=[markdown_output],
            outputs=[slot],
        ).then(
            fn=_show_when_populated,
            inputs=[slot],
            outputs=[slot],
        )

    gr.Markdown("""
---
*Strike Capital AI Diligence System v0.1 | Powered by GPT-5.1 + Pinecone*
""")

# For Hugging Face Spaces Docker deployment
app.launch(server_name="0.0.0.0", server_port=7860)