Spaces:

EnesDS
/

strike-capital-dd

Sleeping

App Files Files Community

strike-capital-dd / app.py

EnesDS

Upload app.py

01d7150 verified 29 days ago

raw

history blame contribute delete

14.8 kB

	"""
	Strike Capital AI Diligence Automation - Gradio Application
	V0 Document Generation from Harmonic PDFs.
	"""
	import os
	import re
	import tempfile
	import gradio as gr
	import markdown2
	from fpdf import FPDF

	from config import validate_config
	from pdf_extractor import extract_from_pdf
	from pinecone_store import store_extracted_data, store_v0_document, store_sentence_chunks
	from v0_generator import generate_v0_document

	# Check the configuration once, while the module is being loaded. A
	# ValueError is reported on stdout and not re-raised, so execution
	# continues with the rest of the module.
	try:
	    validate_config()
	    print("[OK] Configuration validated successfully")
	except ValueError as config_error:
	    print(f"[ERROR] Configuration error: {config_error}")

	# Custom CSS for Strike Capital branding.
	# This stylesheet is inlined into a <style> tag in two places in this file:
	# the in-app preview built by process_pdf() and the standalone page written
	# by export_to_html(). It styles the HTML that markdown2 produces (headings,
	# lists, tables, code, blockquotes) and adds print-specific rules.
	# NOTE(review): the `body` rule is a global selector; when inlined into the
	# in-app preview it presumably also applies to the surrounding Gradio page -
	# confirm that this is intended.
	STRIKE_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');

	body {
	font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
	line-height: 1.6;
	color: #1a1a1a;
	max-width: 900px;
	margin: 0 auto;
	padding: 40px;
	background: #ffffff;
	}

	h1 {
	color: #0a0a0a;
	font-weight: 700;
	font-size: 28px;
	border-bottom: 3px solid #2563eb;
	padding-bottom: 12px;
	margin-bottom: 24px;
	}

	h2 {
	color: #0a0a0a;
	font-weight: 600;
	font-size: 20px;
	margin-top: 32px;
	margin-bottom: 16px;
	padding-bottom: 8px;
	border-bottom: 1px solid #e5e5e5;
	}

	h3 {
	color: #262626;
	font-weight: 600;
	font-size: 16px;
	margin-top: 24px;
	}

	p {
	margin-bottom: 16px;
	}

	ul, ol {
	margin-bottom: 16px;
	padding-left: 24px;
	}

	li {
	margin-bottom: 8px;
	}

	/* Table wrapper for horizontal scroll */
	.table-wrapper {
	overflow-x: auto;
	margin: 16px 0;
	}

	table {
	width: 100%;
	min-width: 600px;
	border-collapse: collapse;
	font-size: 13px;
	display: block;
	overflow-x: auto;
	}

	th {
	background: #f5f5f5;
	font-weight: 600;
	text-align: left;
	padding: 10px 8px;
	border: 1px solid #e5e5e5;
	white-space: nowrap;
	}

	td {
	padding: 8px;
	border: 1px solid #e5e5e5;
	min-width: 80px;
	}

	tr:nth-child(even) {
	background: #fafafa;
	}

	strong {
	font-weight: 600;
	}

	em {
	color: #525252;
	}

	hr {
	border: none;
	border-top: 1px solid #e5e5e5;
	margin: 24px 0;
	}

	code {
	background: #f5f5f5;
	padding: 2px 6px;
	border-radius: 4px;
	font-size: 14px;
	}

	blockquote {
	border-left: 4px solid #2563eb;
	padding-left: 16px;
	margin: 16px 0;
	color: #525252;
	}

	.header-meta {
	color: #737373;
	font-size: 14px;
	margin-bottom: 24px;
	}

	@media print {
	body {
	padding: 20px;
	}
	h2 {
	page-break-after: avoid;
	}
	}
	"""


	def process_pdf(pdf_file, progress=gr.Progress()):
	    """
	    Main processing pipeline: PDF → Extract → Store → Generate V0 → Store.

	    Args:
	        pdf_file: Uploaded PDF, either a filesystem path (``str``) or an
	            object that exposes the path as ``.name``.
	        progress: Gradio progress tracker

	    Returns:
	        Tuple of (markdown_content, html_preview, status_message). When no
	        file was given or any step raises, markdown_content is empty and the
	        other two elements carry the message to show to the user.
	    """
	    import html  # function-scope import; only used to escape the error text

	    if pdf_file is None:
	        return "", "<p>Please upload a Harmonic PDF report.</p>", "⚠️ No file uploaded"

	    def fraction(done, total):
	        # A callback reporting total == 0 must not abort the whole pipeline
	        # with ZeroDivisionError; treat that phase as already complete.
	        return done / total if total else 1.0

	    try:
	        # The upload component is declared with type="filepath", so a plain
	        # path string is a valid input; file-like objects are still accepted.
	        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

	        # Step 1: Extract data from PDF (returns extracted_data AND sentence_chunks)
	        progress(0.1, desc="📄 Extracting text and chunking sentences...")

	        def extraction_progress(batch, total):
	            # Extraction occupies the 10%-40% span of the progress bar.
	            progress(0.1 + 0.3 * fraction(batch, total), desc=f"🔍 Processing batch {batch}/{total}...")

	        extracted_data, sentence_chunks = extract_from_pdf(pdf_path, progress_callback=extraction_progress)

	        # Get company name; a missing, None or empty value falls back to a placeholder
	        company_info = extracted_data.get("company_info") or {}
	        company_name = company_info.get("name") or "Unknown Company"
	        progress(0.4, desc=f"✓ Extracted data for {company_name}")

	        # Step 2: Store sentence chunks in Pinecone (one vector per sentence)
	        progress(0.45, desc=f"💾 Storing {len(sentence_chunks)} sentence chunks in Pinecone...")
	        chunks_result = store_sentence_chunks(company_name, sentence_chunks)

	        # Step 3: Store extracted sections in Pinecone
	        progress(0.55, desc="💾 Storing extracted sections...")
	        store_result = store_extracted_data(company_name, extracted_data)

	        # Step 4: Generate V0 document
	        def generation_progress(step, total, section_name):
	            # Generation occupies the 60%-95% span of the progress bar.
	            progress(0.6 + 0.35 * fraction(step, total), desc=f"✍️ Generating: {section_name}...")

	        v0_document = generate_v0_document(
	            extracted_data,
	            company_name,
	            progress_callback=generation_progress
	        )

	        # Step 5: Store V0 document
	        progress(0.95, desc="💾 Saving V0 document...")
	        store_v0_document(company_name, v0_document)

	        # Convert to HTML for preview
	        html_content = markdown2.markdown(
	            v0_document,
	            extras=["tables", "fenced-code-blocks", "header-ids"]
	        )
	        html_preview = f"<style>{STRIKE_CSS}</style>{html_content}"

	        progress(1.0, desc="✓ Complete!")

	        status = f"✅ Successfully generated V0 for {company_name}\n\n"
	        status += f"- Sentence chunks stored: {chunks_result.get('chunks_stored', 'N/A')}\n"
	        status += f"- Sections stored: {store_result.get('sections_stored', 'N/A')}\n"
	        status += "- Document version: v0"

	        return v0_document, html_preview, status

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of crashing the
	        # handler, and leave a trace in the server log.
	        error_msg = f"❌ Error: {str(e)}"
	        print(f"[ERROR] V0 generation failed: {e!r}")
	        # The exception text may contain '<', '>' or '&'; escape it so it is
	        # displayed literally rather than interpreted as markup.
	        return "", f"<p style='color: red;'>{html.escape(error_msg)}</p>", error_msg


	def export_to_html(markdown_content, auto_print=False):
	    """Render markdown into a standalone styled HTML file and return its path.

	    Returns None when ``markdown_content`` is empty. With ``auto_print`` set,
	    the page embeds a script that opens the print dialog shortly after load.
	    The temporary file is created with ``delete=False``, so it stays on disk.
	    """
	    if not markdown_content:
	        return None

	    body_html = markdown2.markdown(
	        markdown_content,
	        extras=["tables", "fenced-code-blocks", "header-ids"],
	    )

	    # Optional script that triggers the browser's print dialog on page load
	    print_script = ""
	    if auto_print:
	        print_script = """
	<script>
	window.onload = function() {
	// Small delay to ensure styles are loaded
	setTimeout(function() {
	window.print();
	}, 500);
	}
	</script>
	"""

	    page = f"""<!DOCTYPE html>
	<html>
	<head>
	<meta charset="UTF-8">
	<title>Strike Capital - V0 Diligence Document</title>
	<style>{STRIKE_CSS}</style>
	{print_script}
	</head>
	<body>
	{body_html}
	</body>
	</html>"""

	    # Persist the page to a temp file that survives closing
	    out_file = tempfile.NamedTemporaryFile(
	        mode='w', suffix='.html', delete=False, encoding='utf-8'
	    )
	    with out_file:
	        out_file.write(page)
	    return out_file.name


	def export_to_pdf(markdown_content):
	    """Export markdown to a real PDF file using fpdf2.

	    The markdown is rendered line by line with the built-in Helvetica font:
	    ``#``/``##``/``###`` headings, ``-``/``*`` bullets, numbered items,
	    horizontal rules, fully bold lines and plain paragraphs. All text is
	    passed through ``clean_text_for_pdf`` before it is written.

	    Args:
	        markdown_content: Markdown source of the document.

	    Returns:
	        Path of the generated temporary ``.pdf`` file (created with
	        ``delete=False``), or None when ``markdown_content`` is empty.
	    """
	    if not markdown_content:
	        return None

	    pdf = FPDF()
	    pdf.set_auto_page_break(auto=True, margin=15)
	    pdf.add_page()

	    # Use built-in fonts (Helvetica for body)
	    # Note: For full Unicode support, you'd need to add a Unicode font

	    def begin_text(style, size, rgb):
	        # multi_cell() can leave the cursor at the right edge of the cell it
	        # just drew (fpdf2 default new_x=RIGHT). Return to the left margin so
	        # consecutive lines always get the full page width.
	        pdf.set_x(pdf.l_margin)
	        pdf.set_font('Helvetica', style, size)
	        pdf.set_text_color(*rgb)

	    def draw_rule(rgb, width):
	        # Horizontal line across the page at the current vertical position.
	        pdf.set_draw_color(*rgb)
	        pdf.set_line_width(width)
	        pdf.line(10, pdf.get_y(), 200, pdf.get_y())

	    # Process markdown content line by line
	    for raw_line in markdown_content.split('\n'):
	        line = raw_line.strip()

	        if not line:
	            pdf.ln(4)
	            continue

	        # Handle headers
	        if line.startswith('# '):
	            begin_text('B', 20, (10, 10, 10))
	            pdf.multi_cell(0, 10, clean_text_for_pdf(line[2:].strip()))
	            pdf.ln(2)
	            # Add underline
	            draw_rule((37, 99, 235), 0.8)
	            pdf.ln(6)

	        elif line.startswith('## '):
	            pdf.ln(6)
	            begin_text('B', 16, (10, 10, 10))
	            pdf.multi_cell(0, 8, clean_text_for_pdf(line[3:].strip()))
	            pdf.ln(2)

	        elif line.startswith('### '):
	            pdf.ln(4)
	            begin_text('B', 13, (38, 38, 38))
	            pdf.multi_cell(0, 7, clean_text_for_pdf(line[4:].strip()))
	            pdf.ln(2)

	        elif line.startswith(('- ', '* ')):
	            begin_text('', 11, (26, 26, 26))
	            # Bullet point
	            pdf.cell(8, 6, chr(149))  # bullet character
	            pdf.multi_cell(0, 6, clean_text_for_pdf(line[2:].strip()))

	        elif re.match(r'^\d+\.\s', line):
	            begin_text('', 11, (26, 26, 26))
	            pdf.multi_cell(0, 6, clean_text_for_pdf(line))

	        # Rules are tested before bold lines: '***' also starts and ends
	        # with '**' and would otherwise be rendered as an empty bold line.
	        elif line.startswith('---') or re.fullmatch(r'\*{3,}', line):
	            pdf.ln(4)
	            draw_rule((229, 229, 229), 0.3)
	            pdf.ln(4)

	        # A line that is bold as a whole: **text**
	        elif len(line) > 4 and line.startswith('**') and line.endswith('**'):
	            begin_text('B', 11, (26, 26, 26))
	            pdf.multi_cell(0, 6, clean_text_for_pdf(line.strip('*')))

	        else:
	            # Regular paragraph; inline markers (**bold**, *italic*, `code`)
	            # are removed by clean_text_for_pdf
	            begin_text('', 11, (26, 26, 26))
	            pdf.multi_cell(0, 6, clean_text_for_pdf(line))

	    # Reserve a temp file name, close the handle, then let fpdf write to the
	    # path so the file is never open twice at the same time.
	    with tempfile.NamedTemporaryFile(mode='wb', suffix='.pdf', delete=False) as tmp:
	        output_path = tmp.name
	    pdf.output(output_path)
	    return output_path


	# Inline markdown markers removed by clean_text_for_pdf, in application
	# order: bold must be stripped before italic, because '**' would otherwise
	# be consumed as two italic markers.
	_PDF_MARKUP_PATTERNS = (
	    re.compile(r'\*\*(.+?)\*\*'),  # **bold**
	    re.compile(r'\*(.+?)\*'),      # *italic*
	    re.compile(r'`(.+?)`'),        # `inline code`
	)

	# Characters outside latin-1 that get an explicit replacement instead of '?'.
	_PDF_CHAR_TABLE = str.maketrans({
	    '\u2192': '->',     # rightwards arrow
	    '\u26a0': '[!]',    # warning sign
	    '\u2713': '[OK]',   # check mark
	    '\u2705': '[OK]',   # white heavy check mark
	    '\u274c': '[X]',    # cross mark
	    '\U0001F4C4': '',   # page facing up
	    '\U0001F4CA': '',   # bar chart
	    '\U0001F4CB': '',   # clipboard
	    '\U0001F4BE': '',   # floppy disk
	    '\U0001F50D': '',   # magnifying glass
	    '\u270d': '',       # writing hand
	    '\U0001F680': '',   # rocket
	    '\U0001F4E4': '',   # outbox tray
	    '\U0001F4E5': '',   # inbox tray
	    '\U0001F3AF': '',   # direct hit
	    '\ufe0f': '',       # emoji variation selector left behind by e.g. the warning sign
	    '\u2018': "'",      # left single quotation mark
	    '\u2019': "'",      # right single quotation mark
	    '\u201c': '"',      # left double quotation mark
	    '\u201d': '"',      # right double quotation mark
	    '\u2013': '-',      # en dash
	    '\u2014': '-',      # em dash
	    '\u2026': '...',    # horizontal ellipsis
	    '\u2022': '-',      # bullet
	})


	def clean_text_for_pdf(text):
	    """Clean text for PDF output - remove markdown formatting and handle special chars.

	    Steps, in order: strip inline ``**bold**``, ``*italic*`` and
	    `` `code` `` markers (keeping the enclosed text); replace or drop a
	    fixed set of symbols, emoji and typographic punctuation; finally force
	    the result into latin-1, turning every remaining unsupported character
	    into ``?``.

	    Args:
	        text: A single line or fragment of markdown text.

	    Returns:
	        A string containing only latin-1 characters.
	    """
	    for pattern in _PDF_MARKUP_PATTERNS:
	        text = pattern.sub(r'\1', text)
	    # Replace special Unicode chars that might cause issues
	    text = text.translate(_PDF_CHAR_TABLE)
	    # Encode to latin-1 (what FPDF uses by default), replacing unknown chars
	    return text.encode('latin-1', errors='replace').decode('latin-1')


	# Build Gradio Interface
	# Layout: a narrow left column (upload, generate button, status, export
	# controls) next to a wide right column (rendered preview and raw markdown
	# tabs), followed by the event wiring and a footer.
	# NOTE(review): the source this was reviewed from had lost its indentation,
	# so the nesting below is reconstructed; in particular, confirm whether the
	# two download widgets belong below the export button row or inside it.
	with gr.Blocks(
	    title="Strike Capital - AI Diligence",
	    theme=gr.themes.Soft(
	        primary_hue="blue",
	        neutral_hue="slate",
	    ),
	    css="""
	.container { max-width: 1400px; margin: auto; }
	.header { text-align: center; margin-bottom: 2rem; }
	.status-box { padding: 1rem; border-radius: 8px; background: #f8fafc; }
	/* Make tables horizontally scrollable */
	table { display: block; overflow-x: auto; white-space: nowrap; max-width: 100%; }
	"""
	) as app:

	    gr.Markdown("""
	# 🎯 Strike Capital - AI Diligence Automation

	Upload a Harmonic PDF report to automatically generate a V0 Diligence Document.
	""")

	    with gr.Row():
	        with gr.Column(scale=1, min_width=280):
	            gr.Markdown("### 📤 Upload Harmonic Report")
	            pdf_input = gr.File(
	                label="Harmonic PDF Report",
	                file_types=[".pdf"],
	                type="filepath"
	            )

	            generate_btn = gr.Button(
	                "🚀 Generate V0 Document",
	                variant="primary",
	                size="lg"
	            )

	            gr.Markdown("### 📊 Status")
	            status_output = gr.Markdown("Ready to process...")

	            gr.Markdown("### 📥 Export")
	            with gr.Row():
	                export_html_btn = gr.Button("📄 HTML", size="sm", variant="secondary")
	                export_pdf_btn = gr.Button("📑 Save as PDF", size="sm", variant="primary")
	            # Hidden until an export has produced a file (see handlers below)
	            html_download = gr.File(label="HTML Download", visible=False)
	            pdf_download = gr.File(label="PDF Download", visible=False)

	        with gr.Column(scale=4):
	            gr.Markdown("### 📋 Document Preview")

	            with gr.Tabs():
	                with gr.TabItem("Preview"):
	                    html_preview = gr.HTML(
	                        value="<p style='color: #666; text-align: center; padding: 40px;'>Upload a PDF to generate your V0 document...</p>"
	                    )

	                with gr.TabItem("Markdown"):
	                    markdown_output = gr.Textbox(
	                        label="Raw Markdown",
	                        lines=30,
	                        max_lines=50,
	                        show_copy_button=True
	                    )

	    # Event handlers
	    # Generate: run the pipeline and fill the markdown, preview and status widgets
	    generate_btn.click(
	        fn=process_pdf,
	        inputs=[pdf_input],
	        outputs=[markdown_output, html_preview, status_output],
	        show_progress=True
	    )

	    # Export: build the file from the raw markdown, then show the download
	    # widget only if a file was actually produced
	    export_html_btn.click(
	        fn=export_to_html,
	        inputs=[markdown_output],
	        outputs=[html_download]
	    ).then(
	        fn=lambda x: gr.update(visible=bool(x)),
	        inputs=[html_download],
	        outputs=[html_download]
	    )

	    export_pdf_btn.click(
	        fn=export_to_pdf,
	        inputs=[markdown_output],
	        outputs=[pdf_download]
	    ).then(
	        fn=lambda x: gr.update(visible=bool(x)),
	        inputs=[pdf_download],
	        outputs=[pdf_download]
	    )

	    # '\\|' keeps the backslash-pipe text as before, but as a valid
	    # escape sequence in this non-raw string
	    gr.Markdown("""
	---
	Strike Capital AI Diligence System v0.1 \\| Powered by GPT-5.1 + Pinecone
	""")


	# For Hugging Face Spaces Docker deployment: start the Gradio server on
	# host 0.0.0.0, port 7860. This runs whenever the module is executed.
	app.launch(server_port=7860, server_name="0.0.0.0")