SimranShaikh committed on
Commit 267cab6 · verified · 1 Parent(s): b8bcf74
Files changed (1)
  1. src/streamlit_app.py +424 -690
src/streamlit_app.py CHANGED
@@ -1,741 +1,475 @@
- import streamlit as st
- import os
- import tempfile
-
- # Fix cache permission issues in HF Spaces
- os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
- os.environ['HF_HOME'] = tempfile.gettempdir()
- os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
-
- import torch
  import PyPDF2
- import docx
  import pandas as pd
- from sentence_transformers import SentenceTransformer
- import chromadb
- from chromadb.config import Settings
- import tempfile
- import uuid
  import re
- from datetime import datetime
-
- # Page config
- st.set_page_config(
-     page_title="FinanceGPT - Enterprise AI Assistant",
-     page_icon="💰",
-     layout="wide",
-     initial_sidebar_state="expanded"
- )
-
- # Custom CSS
- st.markdown("""
- <style>
-     .main-header {
-         font-size: 3rem;
-         color: #1f77b4;
-         text-align: center;
-         margin-bottom: 2rem;
-     }
-     .chat-message {
-         padding: 1rem;
-         border-radius: 0.5rem;
-         margin: 1rem 0;
-         background-color: #f0f2f6;
-     }
-     .source-box {
-         background-color: #e8f4f8;
-         padding: 1rem;
-         border-radius: 0.5rem;
-         border-left: 4px solid #1f77b4;
-     }
-     .doc-summary {
-         background-color: #f8f9fa;
-         padding: 1rem;
-         border-radius: 0.5rem;
-         border: 1px solid #dee2e6;
-         margin: 1rem 0;
-     }
-     .analysis-card {
-         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-         color: white;
-         padding: 1rem;
-         border-radius: 0.5rem;
-         margin: 0.5rem 0;
-     }
-     .metric-card {
-         background-color: #ffffff;
-         padding: 1rem;
-         border-radius: 0.5rem;
-         box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-         text-align: center;
-         margin: 0.5rem 0;
-     }
- </style>
- """, unsafe_allow_html=True)
-
- # Initialize session state
- if 'processed_docs' not in st.session_state:
-     st.session_state.processed_docs = {}
- if 'analysis_cache' not in st.session_state:
-     st.session_state.analysis_cache = {}
-
- # Document analysis types
- ANALYSIS_TYPES = {
-     "📊 Financial Summary": {
-         "description": "Extract key financial metrics, ratios, and performance indicators",
-         "keywords": ["revenue", "profit", "loss", "assets", "liabilities", "cash flow", "ROI", "margin"],
-         "icon": "📊"
-     },
-     "⚠️ Risk Analysis": {
-         "description": "Identify potential risks, threats, and vulnerability factors",
-         "keywords": ["risk", "threat", "vulnerability", "exposure", "mitigation", "hedge", "insurance"],
-         "icon": "⚠️"
-     },
-     "📈 Market Trends": {
-         "description": "Analyze market conditions, trends, and competitive landscape",
-         "keywords": ["market", "trend", "growth", "competition", "industry", "outlook", "forecast"],
-         "icon": "📈"
-     },
-     "✅ Compliance Check": {
-         "description": "Review regulatory compliance and legal requirements",
-         "keywords": ["compliance", "regulation", "legal", "audit", "governance", "policy", "standard"],
-         "icon": "✅"
-     },
-     "💡 Investment Insights": {
-         "description": "Extract investment recommendations and opportunities",
-         "keywords": ["investment", "opportunity", "recommendation", "valuation", "return", "portfolio"],
-         "icon": "💡"
-     },
-     "📋 Executive Summary": {
-         "description": "Generate high-level overview and key takeaways",
-         "keywords": ["summary", "overview", "highlights", "conclusion", "recommendation", "action"],
-         "icon": "📋"
-     },
-     "🔍 Detailed Analysis": {
-         "description": "Comprehensive deep-dive analysis of all content",
-         "keywords": ["analysis", "detailed", "comprehensive", "thorough", "complete", "full"],
-         "icon": "🔍"
-     },
-     "📊 Data Extraction": {
-         "description": "Extract tables, numbers, and structured data",
-         "keywords": ["data", "table", "number", "figure", "statistic", "metric", "KPI"],
-         "icon": "📊"
      }
- }
-
- @st.cache_resource
- def load_models():
-     """Load and cache models with better error handling"""
-     try:
-         # Load embedding model first (most reliable)
-         st.info("Loading embedding model...")
-         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-
-         # Initialize ChromaDB
-         st.info("Initializing vector database...")
-         client = chromadb.Client()
-         try:
-             collection = client.get_collection("documents")
-         except:
-             collection = client.create_collection(
-                 name="documents",
-                 metadata={"hnsw:space": "cosine"}
-             )
-
-         st.success("✅ Models loaded successfully!")
-         return embedding_model, collection
-
-     except Exception as e:
-         st.error(f"❌ Error loading models: {str(e)}")
-         st.error("Please check your internet connection and try refreshing the page.")
-         return None, None
-
- def validate_file(uploaded_file):
-     """Validate uploaded file"""
-     max_size = 50 * 1024 * 1024  # 50MB
-     if uploaded_file.size > max_size:
-         return False, f"File {uploaded_file.name} is too large. Maximum size is 50MB."
-
-     allowed_extensions = ['pdf', 'docx', 'txt', 'xlsx', 'xls']
-     file_extension = uploaded_file.name.split('.')[-1].lower()
-     if file_extension not in allowed_extensions:
-         return False, f"File type .{file_extension} is not supported."
-
-     return True, "Valid file"
-
- def analyze_document_structure(text, filename):
-     """Analyze document structure and extract metadata"""
-     analysis = {
-         'filename': filename,
-         'word_count': len(text.split()),
-         'char_count': len(text),
-         'estimated_pages': max(1, len(text) // 2000),  # Minimum 1 page
-         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
-         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
-         'sections': [],
-         'key_terms': [],
-         'document_type': 'Unknown'
-     }
-
-     # Detect document type
-     text_lower = text.lower()
-     if any(term in text_lower for term in ['financial statement', 'balance sheet', 'income statement']):
-         analysis['document_type'] = 'Financial Statement'
-     elif any(term in text_lower for term in ['annual report', '10-k', '10-q']):
-         analysis['document_type'] = 'Annual Report'
-     elif any(term in text_lower for term in ['investment', 'portfolio', 'fund']):
-         analysis['document_type'] = 'Investment Document'
-     elif any(term in text_lower for term in ['contract', 'agreement', 'terms']):
-         analysis['document_type'] = 'Legal Document'
-     elif any(term in text_lower for term in ['budget', 'forecast', 'projection']):
-         analysis['document_type'] = 'Financial Planning'
-     else:
-         analysis['document_type'] = 'Business Document'
-
-     # Extract sections (headers)
-     headers = re.findall(r'^[A-Z][A-Za-z\s]{10,50}$', text, re.MULTILINE)
-     analysis['sections'] = headers[:10]  # Top 10 sections
-
-     # Extract key financial terms
-     financial_terms = re.findall(r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin|expenses|income|growth|risk|return)\b', text, re.IGNORECASE)
-     analysis['key_terms'] = list(set(financial_terms))[:15]
-
-     return analysis
-
- @st.cache_data
- def process_document(uploaded_file):
-     """Process uploaded document with enhanced error handling"""
-     is_valid, message = validate_file(uploaded_file)
-     if not is_valid:
-         raise ValueError(message)

      try:
-         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
-             tmp_file.write(uploaded_file.getvalue())
-             tmp_path = tmp_file.name
      except Exception as e:
-         raise ValueError(f"Failed to create temporary file: {str(e)}")

      try:
-         file_extension = uploaded_file.name.split('.')[-1].lower()
-         text = ""
-
-         if file_extension == 'pdf':
-             try:
-                 with open(tmp_path, 'rb') as file:
-                     reader = PyPDF2.PdfReader(file)
-                     if len(reader.pages) == 0:
-                         raise ValueError("PDF file appears to be empty")
-                     for page in reader.pages:
-                         page_text = page.extract_text()
-                         if page_text:
-                             text += page_text + "\n"
-                 if not text.strip():
-                     raise ValueError("Could not extract text from PDF")
-             except Exception as e:
-                 raise ValueError(f"Error reading PDF: {str(e)}")
-
-         elif file_extension == 'docx':
              try:
-                 doc = docx.Document(tmp_path)
-                 for paragraph in doc.paragraphs:
-                     if paragraph.text.strip():
-                         text += paragraph.text + "\n"
-                 if not text.strip():
-                     raise ValueError("DOCX file appears to be empty")
              except Exception as e:
-                 raise ValueError(f"Error reading DOCX: {str(e)}")
-
-         elif file_extension == 'txt':
-             try:
-                 # Try UTF-8 first
-                 with open(tmp_path, 'r', encoding='utf-8') as file:
-                     text = file.read()
-             except UnicodeDecodeError:
-                 try:
-                     # Fallback to latin-1
-                     with open(tmp_path, 'r', encoding='latin-1') as file:
-                         text = file.read()
-                 except Exception as e:
-                     raise ValueError(f"Error reading TXT file: {str(e)}")
-             except Exception as e:
-                 raise ValueError(f"Error reading TXT file: {str(e)}")
-
-         elif file_extension in ['xlsx', 'xls']:
-             try:
-                 df = pd.read_excel(tmp_path, sheet_name=0)  # Read first sheet
-                 if df.empty:
-                     raise ValueError("Excel file appears to be empty")
-                 text = df.to_string(index=False)
-             except Exception as e:
-                 raise ValueError(f"Error reading Excel file: {str(e)}")
-
-         if not text or not text.strip():
-             raise ValueError("No readable text content found in the file")
-
-         # Clean up text
-         text = re.sub(r'\n\s*\n', '\n\n', text)  # Remove excessive newlines
-         text = text.strip()
-
-         # Analyze document structure
-         analysis = analyze_document_structure(text, uploaded_file.name)
-
-         return text, uploaded_file.name, analysis
-
-     finally:
-         try:
-             if os.path.exists(tmp_path):
-                 os.remove(tmp_path)
-         except:
-             pass
-
- def generate_analysis_by_type(text, analysis_type, analysis_info):
-     """Generate specific analysis based on type"""
-     keywords = analysis_info['keywords']
-     description = analysis_info['description']
-
-     # Find relevant sections based on keywords
-     relevant_sections = []
-     text_lower = text.lower()
-
-     for keyword in keywords:
-         if keyword in text_lower:
-             # Find context around keywords
-             pattern = rf'.{{0,200}}\b{keyword}\b.{{0,200}}'
-             matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
-             relevant_sections.extend(matches[:2])  # Max 2 matches per keyword
-
-     if not relevant_sections:
-         # If no keyword matches, provide general analysis
-         words = text.split()
-         if len(words) > 500:
-             sample_text = ' '.join(words[:500]) + "..."
-         else:
-             sample_text = text
-
-         return f"""
- ## {analysis_type}
-
- **Analysis Focus**: {description}
-
- **Document Analysis**:
- Based on the document content, here are the key insights related to {analysis_type.lower()}:
-
- {sample_text}
-
- **Summary**: The document has been analyzed for {analysis_type.lower()} content. While specific keywords weren't found, the above content provides relevant context for your analysis needs.
- """
-
-     # Create structured analysis
-     analysis_result = f"""
- ## {analysis_type}
-
- **Analysis Focus**: {description}
-
- **Key Findings**:
- """
-
-     for i, section in enumerate(relevant_sections[:5], 1):
-         cleaned_section = re.sub(r'\s+', ' ', section.strip())
-         if len(cleaned_section) > 300:
-             cleaned_section = cleaned_section[:300] + "..."
-         analysis_result += f"\n**Finding {i}**: {cleaned_section}\n"
-
-     analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}. These findings provide insights into the document's content from the perspective of {description.lower()}."
-
-     return analysis_result
-
- def chunk_text(text, chunk_size=1000, overlap=200):
-     """Split text into chunks with better handling"""
-     if not text or not text.strip():
-         return []
-
-     # Clean text first
-     text = re.sub(r'\s+', ' ', text.strip())
-
-     chunks = []
-     start = 0
-
-     while start < len(text):
-         end = start + chunk_size
-
-         if end >= len(text):
-             # Last chunk
-             chunk = text[start:]
          else:
-             chunk = text[start:end]
-             # Try to break at sentence boundary
-             last_period = chunk.rfind('.')
-             last_newline = chunk.rfind('\n')
-             break_point = max(last_period, last_newline)
-
-             if break_point > chunk_size * 0.5:  # If we found a good break point
-                 end = start + break_point + 1
-                 chunk = text[start:end]
-
-         if chunk.strip() and len(chunk.strip()) > 50:  # Only add substantial chunks
-             chunks.append(chunk.strip())
-
-         start = end - overlap
-
-         if start >= len(text):
-             break
-
-     return chunks
-
- def search_documents(query, collection, embedding_model, n_results=3):
-     """Search for relevant document chunks with better error handling"""
      try:
-         if collection.count() == 0:
-             return []
-
-         # Generate query embedding
-         query_embedding = embedding_model.encode([query]).tolist()
-
-         # Search the collection
-         results = collection.query(
-             query_embeddings=query_embedding,
-             n_results=min(n_results, collection.count()),
-             include=['documents', 'metadatas', 'distances']
-         )
-
-         search_results = []
-         if results['documents'] and results['documents'][0]:
-             for i in range(len(results['documents'][0])):
-                 search_results.append({
-                     'content': results['documents'][0][i],
-                     'metadata': results['metadatas'][0][i] if results['metadatas'][0] else {},
-                     'score': 1 - results['distances'][0][i] if results['distances'][0][i] else 1.0
-                 })
-
-         return search_results
      except Exception as e:
-         st.error(f"Search error: {str(e)}")
-         return []
-
- def main():
-     # Header
-     st.markdown('<h1 class="main-header">💰 FinanceGPT - Enhanced Enterprise AI Assistant</h1>', unsafe_allow_html=True)
-
-     st.markdown("""
-     <div style="text-align: center; font-size: 1.2rem; color: #666; margin-bottom: 2rem;">
-     🚀 Powered by Advanced AI | 📊 Document Intelligence | 🔒 Secure & Compliant
-     </div>
-     """, unsafe_allow_html=True)
-
-     # Load models
-     with st.spinner("🔄 Loading AI models..."):
-         models = load_models()
-         if models[0] is None:
-             st.error("❌ Failed to load AI models. Please refresh the page and check your internet connection.")
-             st.stop()
-
-     embedding_model, collection = models
-
-     # Sidebar for document management
-     with st.sidebar:
-         st.header("📁 Enhanced Document Management")
-
-         # File upload section
-         st.markdown("### 📤 Upload Documents")
-         st.info("📋 **File Requirements:**\n- Max size: 50MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
-
-         uploaded_files = st.file_uploader(
-             "Choose files",
-             accept_multiple_files=True,
-             type=['pdf', 'docx', 'txt', 'xlsx'],
-             help="Supported formats: PDF, DOCX, TXT, XLSX (Max 50MB each)"
-         )
-
-         if uploaded_files:
-             valid_files = []
-             for file in uploaded_files:
-                 is_valid, message = validate_file(file)
-                 if is_valid:
-                     valid_files.append(file)
-                 else:
-                     st.error(f"❌ {message}")
-
-             if valid_files:
-                 st.success(f"✅ {len(valid_files)} valid files ready!")
-
-                 if st.button("🔄 Process Documents", type="primary"):
-                     progress_bar = st.progress(0)
-                     status_text = st.empty()
-
-                     for i, file in enumerate(valid_files):
-                         status_text.text(f"Processing {file.name}...")
-
-                         try:
-                             text, filename, analysis = process_document(file)
-
-                             # Store document analysis
-                             st.session_state.processed_docs[filename] = {
-                                 'text': text,
-                                 'analysis': analysis,
-                                 'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                             }
-
-                             # Create and store chunks
-                             chunks = chunk_text(text)
-                             if chunks:
-                                 for j, chunk in enumerate(chunks):
-                                     try:
-                                         chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
-                                         embedding = embedding_model.encode([chunk]).tolist()
-
-                                         collection.upsert(
-                                             embeddings=embedding,
-                                             documents=[chunk],
-                                             metadatas=[{'filename': filename, 'chunk_id': j}],
-                                             ids=[chunk_id]
-                                         )
-                                     except Exception as e:
-                                         st.warning(f"Warning: Could not process chunk {j} of {filename}")
-                                         continue
-
-                             st.success(f"✅ {filename}")
-
-                         except Exception as e:
-                             st.error(f"❌ Error processing {file.name}: {str(e)}")
-
-                         progress_bar.progress((i + 1) / len(valid_files))
-
-                     status_text.text("✅ Processing complete!")
-                     st.balloons()
-
-         # Document analysis section
-         if st.session_state.processed_docs:
-             st.markdown("---")
-             st.markdown("### 📊 Document Analysis Options")
-
-             # Select document
-             doc_names = list(st.session_state.processed_docs.keys())
-             selected_doc = st.selectbox("Select Document:", doc_names)
-
-             if selected_doc:
-                 doc_info = st.session_state.processed_docs[selected_doc]
-
-                 # Document overview
-                 st.markdown("#### 📋 Document Overview")
-                 analysis = doc_info['analysis']
-
-                 col1, col2 = st.columns(2)
-                 with col1:
-                     st.metric("Word Count", f"{analysis['word_count']:,}")
-                     st.metric("Pages (Est.)", analysis['estimated_pages'])
-
-                 with col2:
-                     st.metric("Document Type", analysis['document_type'])
-                     financial_status = "✅ Yes" if analysis['has_financial_data'] else "❌ No"
-                     st.write(f"**Financial Data**: {financial_status}")
-
-                 # Key terms
-                 if analysis['key_terms']:
-                     st.markdown("**Key Terms Found:**")
-                     st.write(", ".join(analysis['key_terms'][:10]))
-
-                 # Analysis type selection
-                 st.markdown("#### 🔍 Analysis Types")
-                 analysis_type = st.selectbox(
-                     "Choose Analysis Type:",
-                     list(ANALYSIS_TYPES.keys()),
-                     format_func=lambda x: f"{ANALYSIS_TYPES[x]['icon']} {x.split(' ', 1)[1]}"
-                 )
-
-                 if st.button(f"🚀 Generate {analysis_type}", use_container_width=True):
-                     cache_key = f"{selected_doc}_{analysis_type}"
-
-                     if cache_key not in st.session_state.analysis_cache:
-                         with st.spinner(f"Generating {analysis_type}..."):
-                             analysis_result = generate_analysis_by_type(
-                                 doc_info['text'],
-                                 analysis_type,
-                                 ANALYSIS_TYPES[analysis_type]
-                             )
-                             st.session_state.analysis_cache[cache_key] = analysis_result
-
-                     # Display in main area
-                     st.session_state.current_analysis = st.session_state.analysis_cache[cache_key]
-                     st.session_state.current_analysis_type = analysis_type
-
-     # Main content area
-     col1, col2 = st.columns([2, 1])
-
-     with col1:
-         # Display analysis results if available
-         if hasattr(st.session_state, 'current_analysis'):
-             st.markdown(f"## {st.session_state.current_analysis_type}")
-             st.markdown(f'<div class="analysis-card">{st.session_state.current_analysis}</div>', unsafe_allow_html=True)
-
-             # Clear analysis button
-             if st.button("🗑️ Clear Analysis"):
-                 if hasattr(st.session_state, 'current_analysis'):
-                     del st.session_state.current_analysis
-                 if hasattr(st.session_state, 'current_analysis_type'):
-                     del st.session_state.current_analysis_type
-                 st.rerun()
-
-         st.header("💬 Interactive Q&A")
-
-         # Smart question suggestions
-         if st.session_state.processed_docs:
-             with st.expander("💡 Smart Question Suggestions"):
-                 # Generate context-aware questions
-                 doc_types = set(doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values())
-
-                 smart_questions = []
-                 if 'Financial Statement' in doc_types:
-                     smart_questions.extend([
-                         "What are the key financial ratios mentioned?",
-                         "Analyze the profitability trends",
-                         "What are the major expense categories?"
-                     ])
-                 if 'Investment Document' in doc_types:
-                     smart_questions.extend([
-                         "What are the investment recommendations?",
-                         "What risks are associated with these investments?",
-                         "What is the expected return on investment?"
-                     ])
-                 if 'Annual Report' in doc_types:
-                     smart_questions.extend([
-                         "Summarize the company's performance this year",
-                         "What are the future growth strategies?",
-                         "What challenges does the company face?"
-                     ])
-
-                 # Default questions if no specific type detected
-                 if not smart_questions:
-                     smart_questions = [
-                         "What are the key points in this document?",
-                         "Summarize the main findings",
-                         "What are the most important numbers mentioned?"
-                     ]
-
-                 for question in smart_questions[:6]:
-                     if st.button(question, key=f"smart_{question}", use_container_width=True):
-                         st.session_state.query = question
-
-         # Query input
-         query = st.text_area(
-             "Enter your question:",
-             value=st.session_state.get('query', ''),
-             placeholder="e.g., What are the main financial risks identified in the documents?",
-             height=100
-         )
-
-         if st.button("🔍 Ask Question", type="primary", use_container_width=True):
-             if not query:
-                 st.warning("⚠️ Please enter a question!")
-                 return
-
-             if collection.count() == 0:
-                 st.warning("⚠️ Please upload and process some documents first!")
-                 return
-
-             with st.spinner("🤖 Analyzing documents and generating response..."):
-                 try:
-                     search_results = search_documents(query, collection, embedding_model)
-
-                     if search_results:
-                         # Enhanced response generation
-                         context = ""
-                         source_files = set()
-
-                         for i, chunk in enumerate(search_results):
-                             filename = chunk['metadata'].get('filename', 'Unknown')
-                             source_files.add(filename)
-                             context += f"[Source {i+1}: {filename}]\n{chunk['content'][:400]}...\n\n"
-
-                         response = f"""
- ### 🤖 AI Analysis Results
-
- **Query**: {query}
-
- **Key Findings**:
- {context[:1500]}...
-
- **Summary**: Based on analysis of {len(search_results)} relevant sections from {len(source_files)} document(s), the information above directly addresses your question.
-
- **Documents Analyzed**: {', '.join(source_files)}
- """
-
-                         st.markdown(response)
-
-                         # Enhanced source display
-                         st.markdown("### 📚 Detailed Sources")
-                         for i, result in enumerate(search_results):
-                             score_percent = f"{result['score']:.1%}"
-                             filename = result['metadata'].get('filename', 'Unknown')
-
-                             with st.expander(f"📄 Source {i+1}: {filename} (Relevance: {score_percent})"):
-                                 st.markdown(f'<div class="source-box">{result["content"]}</div>', unsafe_allow_html=True)
-                     else:
-                         st.error("❌ No relevant information found in the uploaded documents.")
-
-                 except Exception as e:
-                     st.error(f"❌ Error processing your question: {str(e)}")
-
-     with col2:
-         st.header("📊 Dashboard")
-
-         # Document statistics
-         if st.session_state.processed_docs:
-             st.markdown("### 📈 Document Statistics")
-
-             total_words = sum(doc['analysis']['word_count'] for doc in st.session_state.processed_docs.values())
-             total_pages = sum(doc['analysis']['estimated_pages'] for doc in st.session_state.processed_docs.values())
-             doc_types = [doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values()]
-
-             col_a, col_b = st.columns(2)
-             with col_a:
-                 st.metric("📄 Documents", len(st.session_state.processed_docs))
-                 st.metric("📊 Total Words", f"{total_words:,}")
-             with col_b:
-                 st.metric("📑 Total Pages", total_pages)
-                 st.metric("🗂️ Document Types", len(set(doc_types)))
-
-             # Document type breakdown
-             if doc_types:
-                 st.markdown("**Document Types:**")
-                 type_counts = {}
-                 for doc_type in doc_types:
-                     type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
-
-                 for doc_type, count in type_counts.items():
-                     st.write(f"• {doc_type}: {count}")
-
-         # Project info
-         st.markdown("---")
-         st.header("🎯 Project Info")
-
-         st.markdown("""
-         ### **Enterprise AI Assistant**
-
-         **🔧 Technology Stack:**
-         - 🧠 Advanced AI Models
-         - 🔍 RAG (Retrieval-Augmented Generation)
-         - 📊 Streamlit UI
-         - 🗄️ ChromaDB Vector Database
-         - 🔒 Enterprise Security
-
-         **💼 Analysis Types:**
-         - 📊 Financial Summary
-         - ⚠️ Risk Analysis
-         - 📈 Market Trends
-         - ✅ Compliance Check
-         - 💡 Investment Insights
-         - 📋 Executive Summary
-         - 🔍 Detailed Analysis
-         - 📊 Data Extraction
-         """)
-
-         # Statistics
-         try:
-             doc_count = collection.count()
-             st.metric("🔗 Vector Chunks", doc_count)
-         except:
-             st.metric("🔗 Vector Chunks", 0)

  if __name__ == "__main__":
-     main()
  import PyPDF2
+ import pdfplumber
+ import fitz  # PyMuPDF
  import pandas as pd
  import re
+ import logging
+ import os
+ from typing import Dict, List, Tuple, Optional
+ from pathlib import Path
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class PDFProcessorError(Exception):
+     """Custom exception for PDF processing errors"""
+     pass
+
+ def enhanced_pdf_processor(file_path: str, timeout: int = 30) -> Dict:
+     """
+     Enhanced PDF processor with robust error handling and multiple extraction methods
+     for better handling of complex PDFs like IBM reports
+     """
+     results = {
+         'text': '',
+         'tables': [],
+         'metadata': {},
+         'extraction_method': 'unknown',
+         'success': False,
+         'error': None,
+         'file_info': {}
      }
+
+     # Validate file
+     if not validate_pdf_file(file_path):
+         results['error'] = "Invalid PDF file or file doesn't exist"
+         return results
+
+     # Get file info
+     results['file_info'] = get_file_info(file_path)
+
+     # Try different extraction methods in order of preference
+     extraction_methods = [
+         ('PyMuPDF', extract_with_pymupdf),
+         ('pdfplumber', extract_with_pdfplumber),
+         ('PyPDF2', extract_with_pypdf2)
+     ]
+
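+     # Each extractor returns a different shape: PyMuPDF gives (text, metadata),
+     # pdfplumber gives (text, tables), and PyPDF2 gives text only, hence the
+     # per-method branches below. A result under ~50 characters is treated as a
+     # failed extraction and the next method is tried.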
+     for method_name, method_func in extraction_methods:
+         try:
+             logger.info(f"Trying extraction method: {method_name}")
+
+             if method_name == 'pdfplumber':
+                 text_result, tables = method_func(file_path)
+                 if text_result and len(text_result.strip()) > 50:
+                     results['text'] = text_result
+                     results['tables'] = tables
+                     results['extraction_method'] = method_name
+                     results['success'] = True
+                     logger.info(f"Successfully extracted with {method_name}")
+                     return results
+             elif method_name == 'PyMuPDF':
+                 text_result, metadata = method_func(file_path)
+                 if text_result and len(text_result.strip()) > 50:
+                     results['text'] = text_result
+                     results['metadata'] = metadata
+                     results['extraction_method'] = method_name
+                     results['success'] = True
+                     logger.info(f"Successfully extracted with {method_name}")
+                     return results
+             else:  # PyPDF2
+                 text_result = method_func(file_path)
+                 if text_result and len(text_result.strip()) > 50:
+                     results['text'] = text_result
+                     results['extraction_method'] = method_name
+                     results['success'] = True
+                     logger.info(f"Successfully extracted with {method_name}")
+                     return results
+
+         except Exception as e:
+             error_msg = f"{method_name} failed: {str(e)}"
+             logger.warning(error_msg)
+             results['error'] = error_msg
+             continue
+
+     # If all methods failed
+     if not results['success']:
+         results['error'] = "All extraction methods failed"
+         logger.error("All PDF extraction methods failed")
+
+     return results
+
+ def validate_pdf_file(file_path: str) -> bool:
+     """Validate PDF file exists and is accessible"""
+     try:
+         path = Path(file_path)
+         if not path.exists():
+             logger.error(f"File does not exist: {file_path}")
+             return False
+
+         if not path.is_file():
+             logger.error(f"Path is not a file: {file_path}")
+             return False
+
+         if path.stat().st_size == 0:
+             logger.error(f"File is empty: {file_path}")
+             return False
+
+         # Check if file is actually a PDF
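+         # (Strict check: the first five bytes must be exactly b'%PDF-'; files
+         # with leading bytes before the marker, which some PDF viewers tolerate,
+         # will be rejected here.)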
+         with open(file_path, 'rb') as f:
+             header = f.read(5)
+             if not header.startswith(b'%PDF-'):
+                 logger.error(f"File is not a valid PDF: {file_path}")
+                 return False
+
+         return True
+
+     except Exception as e:
+         logger.error(f"Error validating PDF file: {e}")
+         return False
+
+ def get_file_info(file_path: str) -> Dict:
+     """Get basic file information"""
+     try:
+         path = Path(file_path)
+         stat = path.stat()
+         return {
+             'name': path.name,
+             'size': stat.st_size,
+             'size_mb': round(stat.st_size / (1024 * 1024), 2),
+             'modified': stat.st_mtime
+         }
+     except Exception as e:
+         logger.warning(f"Could not get file info: {e}")
+         return {}
+
+ def extract_with_pypdf2(file_path: str) -> str:
+     """Extract text using PyPDF2 - fastest method"""
+     text = ""
+     try:
+         with open(file_path, 'rb') as file:
+             reader = PyPDF2.PdfReader(file)
+
+             # Check if PDF is encrypted
+             if reader.is_encrypted:
+                 raise PDFProcessorError("PDF is encrypted and cannot be processed")
+
+             for page_num, page in enumerate(reader.pages):
+                 try:
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += f"\n--- Page {page_num + 1} ---\n"
+                         text += page_text + "\n"
+                 except Exception as e:
+                     logger.warning(f"Failed to extract text from page {page_num + 1}: {e}")
+                     continue
+
+         return clean_extracted_text(text)
+
+     except Exception as e:
+         raise PDFProcessorError(f"PyPDF2 extraction failed: {e}")
+
+ def extract_with_pdfplumber(file_path: str) -> Tuple[str, List[Dict]]:
+     """Extract text and tables using pdfplumber - better for structured docs"""
+     text = ""
+     tables = []

      try:
+         with pdfplumber.open(file_path) as pdf:
+             for page_num, page in enumerate(pdf.pages):
+                 try:
+                     # Extract text
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += f"\n--- Page {page_num + 1} ---\n"
+                         text += page_text + "\n"
+
+                     # Extract tables
+                     page_tables = page.extract_tables()
+                     for table_num, table in enumerate(page_tables):
+                         if table and len(table) > 1 and any(any(cell for cell in row if cell) for row in table):
+                             tables.append({
+                                 'page': page_num + 1,
+                                 'table_number': table_num + 1,
+                                 'data': table,
+                                 'text_representation': table_to_text(table)
+                             })
+
+                 except Exception as e:
+                     logger.warning(f"Failed to process page {page_num + 1}: {e}")
+                     continue
+
+         return clean_extracted_text(text), tables
+
      except Exception as e:
+         raise PDFProcessorError(f"pdfplumber extraction failed: {e}")
+
+ def extract_with_pymupdf(file_path: str) -> Tuple[str, Dict]:
+     """Extract text using PyMuPDF - most robust method"""
+     text = ""
+     metadata = {}

      try:
+         doc = fitz.open(file_path)
+
+         # Check if document is valid
+         if doc.is_closed:
+             raise PDFProcessorError("Could not open PDF document")
+
+         # Extract metadata safely
+         try:
+             doc_metadata = doc.metadata or {}
+             metadata = {
+                 'page_count': doc.page_count,
+                 'title': doc_metadata.get('title', ''),
+                 'author': doc_metadata.get('author', ''),
+                 'subject': doc_metadata.get('subject', ''),
+                 'creator': doc_metadata.get('creator', ''),
+                 'producer': doc_metadata.get('producer', ''),
+                 'creation_date': doc_metadata.get('creationDate', ''),
+                 'modification_date': doc_metadata.get('modDate', '')
+             }
+         except Exception as e:
+             logger.warning(f"Could not extract metadata: {e}")
+             metadata = {'page_count': doc.page_count}
+
+         # Extract text
+         for page_num in range(doc.page_count):
              try:
+                 page = doc[page_num]
+                 page_text = page.get_text()
+                 if page_text:
+                     text += f"\n--- Page {page_num + 1} ---\n"
+                     text += page_text + "\n"
              except Exception as e:
+                 logger.warning(f"Failed to extract text from page {page_num + 1}: {e}")
+                 continue
+
+         doc.close()
+         return clean_extracted_text(text), metadata
+
+     except Exception as e:
+         raise PDFProcessorError(f"PyMuPDF extraction failed: {e}")
+
+ def clean_extracted_text(text: str) -> str:
+     """Clean and normalize extracted text"""
+     if not text:
+         return ""
+
+     try:
+         # Remove excessive whitespace
+         text = re.sub(r'\n\s*\n', '\n\n', text)
+         text = re.sub(r' +', ' ', text)
+
+         # Fix common PDF extraction issues
+         text = text.replace('\ufffd', '')  # Remove unicode replacement chars
+         text = text.replace('\x00', '')  # Remove null characters
+         text = text.replace('\u200b', '')  # Remove zero-width space
+
+         # Normalize line breaks
+         text = text.replace('\r\n', '\n')
+         text = text.replace('\r', '\n')
+
+         # Remove control characters except newlines and tabs
+         text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)
+
+         return text.strip()
+
+     except Exception as e:
+         logger.warning(f"Error cleaning text: {e}")
+         return text.strip() if text else ""
+
+ def table_to_text(table: List[List]) -> str:
+     """Convert table data to readable text format"""
+     if not table:
+         return ""
+
+     try:
+         text_lines = []
+         for row in table:
+             if row:  # Skip empty rows
+                 clean_row = [str(cell).strip() if cell else "" for cell in row]
+                 if any(clean_row):  # Only add non-empty rows
+                     text_lines.append(" | ".join(clean_row))
+
+         return "\n".join(text_lines)
+
+     except Exception as e:
+         logger.warning(f"Error converting table to text: {e}")
+         return ""
+
+ def detect_ibm_document_type(text: str, metadata: Dict) -> str:
+     """Detect specific IBM document types"""
+     try:
+         text_lower = text.lower()
+         title_lower = metadata.get('title', '').lower()
+
+         # IBM-specific patterns
+         if any(term in text_lower for term in ['ibm annual report', 'international business machines']):
+             return 'IBM Annual Report'
+         elif any(term in text_lower for term in ['ibm research', 'watson', 'artificial intelligence']):
+             return 'IBM Research Document'
+         elif any(term in text_lower for term in ['red hat', 'openshift', 'kubernetes']):
+             return 'IBM Cloud/Red Hat Document'
+         elif any(term in text_lower for term in ['mainframe', 'z systems', 'power systems']):
+             return 'IBM Hardware Documentation'
+         elif any(term in text_lower for term in ['cognos', 'spss', 'analytics']):
+             return 'IBM Analytics Document'
+         elif 'ibm' in text_lower:
+             return 'IBM Business Document'
          else:
+             return 'General Document'
+
+     except Exception as e:
+         logger.warning(f"Error detecting document type: {e}")
+         return 'Unknown Document'
+
+ def process_ibm_pdf(file_path: str) -> Dict:
+     """
+     Process IBM PDF with enhanced extraction and error handling
+     """
+     try:
+         result = enhanced_pdf_processor(file_path)
+
+         if result['success']:
+             # Detect IBM document type
+             doc_type = detect_ibm_document_type(result['text'], result['metadata'])
+             result['document_type'] = doc_type
+
+             # Extract IBM-specific metrics if it's a financial document
+             if 'annual report' in doc_type.lower():
+                 result['financial_metrics'] = extract_ibm_financial_metrics(result['text'])
+
+             # Process tables for better analysis
+             if result['tables']:
+                 result['structured_data'] = process_ibm_tables(result['tables'])
+
+         return result
+
+     except Exception as e:
+         logger.error(f"Error processing IBM PDF: {e}")
+         return {
+             'text': '',
+             'tables': [],
+             'metadata': {},
+             'extraction_method': 'unknown',
+             'success': False,
+             'error': str(e),
+             'document_type': 'Unknown'
+         }
+
+ def extract_ibm_financial_metrics(text: str) -> Dict:
+     """Extract IBM-specific financial metrics"""
+     metrics = {}
+
      try:
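+         # Note: each pattern captures only the numeric value; the optional
+         # million/billion qualifier is matched but not captured, so callers
+         # must infer the unit from context.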
+         # Revenue patterns (more comprehensive)
+         revenue_patterns = [
+             r'(?:total\s+)?revenue[:\s]+\$?([\d,]+(?:\.\d+)?)\s*(?:million|billion)?',
+             r'total\s+revenue[:\s]+\$?([\d,]+(?:\.\d+)?)',
+             r'net\s+revenue[:\s]+\$?([\d,]+(?:\.\d+)?)'
+         ]
+
+         for pattern in revenue_patterns:
+             revenue_match = re.search(pattern, text, re.IGNORECASE)
+             if revenue_match:
+                 metrics['revenue'] = revenue_match.group(1)
+                 break
+
+         # Net income patterns
+         income_patterns = [
+             r'net\s+income[:\s]+\$?([\d,]+(?:\.\d+)?)\s*(?:million|billion)?',
+             r'net\s+earnings[:\s]+\$?([\d,]+(?:\.\d+)?)',
+             r'income\s+from\s+continuing\s+operations[:\s]+\$?([\d,]+(?:\.\d+)?)'
+         ]
+
+         for pattern in income_patterns:
+             income_match = re.search(pattern, text, re.IGNORECASE)
+             if income_match:
+                 metrics['net_income'] = income_match.group(1)
+                 break
+
+         # Earnings per share
+         eps_patterns = [
+             r'earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)',
+             r'diluted\s+earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)',
+             r'basic\s+earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)'
+         ]
+
+         for pattern in eps_patterns:
+             eps_match = re.search(pattern, text, re.IGNORECASE)
+             if eps_match:
+                 metrics['eps'] = eps_match.group(1)
+                 break
+
+         return metrics
+
      except Exception as e:
+         logger.warning(f"Error extracting financial metrics: {e}")
+         return {}
+
+ def process_ibm_tables(tables: List[Dict]) -> List[Dict]:
+     """Process IBM tables for better structure"""
+     processed_tables = []
+
+     for table in tables:
+         try:
+             # Convert table to DataFrame for better processing
+             if table.get('data') and len(table['data']) > 1:
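+                 # Assumes the first extracted row is the table's header row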
+                 df = pd.DataFrame(table['data'][1:], columns=table['data'][0])
+
+                 # Clean and process
+                 df = df.dropna(how='all')  # Remove empty rows
+                 df = df.fillna('')  # Fill NaN with empty string
+
+                 # Remove completely empty columns
+                 df = df.loc[:, (df != '').any(axis=0)]
+
+                 if not df.empty:
+                     processed_tables.append({
+                         'page': table.get('page', 0),
+                         'table_number': table.get('table_number', 0),
+                         'dataframe': df,
+                         'summary': f"Table with {len(df)} rows and {len(df.columns)} columns",
+                         'text': df.to_string(index=False)
+                     })
+         except Exception as e:
+             logger.warning(f"Error processing table: {e}")
+             # If DataFrame conversion fails, keep original
+             processed_tables.append(table)
+
+     return processed_tables
+
+ # Additional utility functions for web integration
+ def safe_process_pdf(file_path: str, max_file_size_mb: int = 50) -> Dict:
+     """
+     Safely process PDF with size and security checks
+     """
+     try:
+         # Check file size
+         if os.path.getsize(file_path) > max_file_size_mb * 1024 * 1024:
+             return {
+                 'success': False,
+                 'error': f'File too large. Maximum size: {max_file_size_mb}MB'
+             }
+
+         # Process the PDF
+         return process_ibm_pdf(file_path)
+
+     except Exception as e:
+         logger.error(f"Safe PDF processing failed: {e}")
+         return {
+             'success': False,
+             'error': f'Processing failed: {str(e)}'
+         }

  if __name__ == "__main__":
+     # Example usage
+     pdf_path = "demo.pdf"  # Replace with your PDF path
+
+     result = safe_process_pdf(pdf_path)
+
+     if result['success']:
+         print(f"Successfully processed PDF using {result['extraction_method']}")
+         print(f"Document type: {result.get('document_type', 'Unknown')}")
+         print(f"Text length: {len(result['text'])} characters")
+         print(f"Number of tables: {len(result['tables'])}")
+
+         if result.get('financial_metrics'):
+             print("Financial metrics found:")
+             for metric, value in result['financial_metrics'].items():
+                 print(f"  {metric}: {value}")
+     else:
+         print(f"Failed to process PDF: {result['error']}")