SimranShaikh committed on
Commit cd9e823 · verified · 1 Parent(s): 203cee9
Files changed (1):
  1. src/streamlit_app.py +582 -351
src/streamlit_app.py CHANGED
@@ -1,19 +1,159 @@
 import streamlit as st
 import os
 import tempfile
 import PyPDF2
- import fitz  # PyMuPDF - better for large PDFs
- import io
- from PIL import Image
- import pytesseract  # For OCR if needed
 
- # Enhanced file validation with configurable size limits
- def validate_file(uploaded_file, max_size_mb=100):  # Increased default to 100MB
-     """Enhanced file validation with configurable size limits"""
-     max_size = max_size_mb * 1024 * 1024  # Convert MB to bytes
-
     if uploaded_file.size > max_size:
-         return False, f"File {uploaded_file.name} is too large. Maximum size is {max_size_mb}MB."
 
     allowed_extensions = ['pdf', 'docx', 'txt', 'xlsx', 'xls']
     file_extension = uploaded_file.name.split('.')[-1].lower()
@@ -22,120 +162,47 @@ def validate_file(uploaded_file, max_size_mb=100): # Increased default to 100MB
 
     return True, "Valid file"
 
- def process_large_pdf_pymupdf(file_path, max_pages=None, progress_callback=None):
-     """
-     Process large PDF using PyMuPDF (faster and more memory efficient)
-     """
-     text = ""
-     try:
-         doc = fitz.open(file_path)
-         total_pages = len(doc)
-
-         # Limit pages if specified
-         pages_to_process = min(total_pages, max_pages) if max_pages else total_pages
-
-         for page_num in range(pages_to_process):
-             if progress_callback:
-                 progress_callback(page_num + 1, pages_to_process)
-
-             page = doc[page_num]
-             page_text = page.get_text()
-
-             # If page has very little text, try OCR on images
-             if len(page_text.strip()) < 50:
-                 try:
-                     # Get page as image and apply OCR
-                     pix = page.get_pixmap()
-                     img_data = pix.tobytes("png")
-                     img = Image.open(io.BytesIO(img_data))
-                     ocr_text = pytesseract.image_to_string(img)
-                     if len(ocr_text.strip()) > len(page_text.strip()):
-                         page_text = ocr_text
-                 except Exception as e:
-                     # OCR failed, continue with extracted text
-                     pass
-
-             text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
-
-         doc.close()
-         return text, total_pages
-
-     except Exception as e:
-         raise ValueError(f"Error processing PDF with PyMuPDF: {str(e)}")
-
- def process_large_pdf_streaming(file_path, chunk_size=1024*1024, max_pages=None):
-     """
-     Process large PDF in streaming fashion to handle memory constraints
-     """
-     text = ""
-     try:
-         with open(file_path, 'rb') as file:
-             reader = PyPDF2.PdfReader(file)
-             total_pages = len(reader.pages)
-
-             # Limit pages if specified
-             pages_to_process = min(total_pages, max_pages) if max_pages else total_pages
-
-             for page_num in range(pages_to_process):
-                 try:
-                     page = reader.pages[page_num]
-                     page_text = page.extract_text()
-                     text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
-
-                     # Yield control periodically to prevent blocking
-                     if page_num % 10 == 0:  # Every 10 pages
-                         # In Streamlit, you might want to update progress here
-                         pass
-
-                 except Exception as e:
-                     # Skip problematic pages
-                     text += f"\n--- Page {page_num + 1} (Error) ---\nError extracting text: {str(e)}\n"
-                     continue
-
-             return text, total_pages
-
-     except Exception as e:
-         raise ValueError(f"Error processing PDF with streaming: {str(e)}")
-
- def compress_pdf_text(text, compression_ratio=0.7):
-     """
-     Compress extracted text by removing redundant content
-     """
-     lines = text.split('\n')
-     compressed_lines = []
-     seen_lines = set()
 
-     for line in lines:
-         # Remove extra whitespace
-         cleaned_line = ' '.join(line.split())
-
-         # Skip empty lines and very short lines
-         if len(cleaned_line) < 3:
-             continue
-
-         # Skip duplicate lines (common in headers/footers)
-         if cleaned_line in seen_lines:
-             continue
-
-         seen_lines.add(cleaned_line)
-         compressed_lines.append(cleaned_line)
-
-         # Stop if we've reached the compression target
-         if len(compressed_lines) >= len(lines) * compression_ratio:
-             break
 
-     return '\n'.join(compressed_lines)
 
 @st.cache_data
- def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, use_compression=True):
-     """
-     Enhanced document processing with support for larger files
-     """
-     is_valid, message = validate_file(uploaded_file, max_size_mb)
     if not is_valid:
         raise ValueError(message)
 
-     # Create temporary file
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
             tmp_file.write(uploaded_file.getvalue())
@@ -146,58 +213,24 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         text = ""
-         total_pages = 0
 
         if file_extension == 'pdf':
-             # Progress callback for Streamlit
-             progress_bar = st.progress(0)
-             status_text = st.empty()
-
-             def update_progress(current_page, total_pages):
-                 progress = current_page / total_pages
-                 progress_bar.progress(progress)
-                 status_text.text(f"Processing page {current_page} of {total_pages}")
-
-             # Try PyMuPDF first (better for large files)
             try:
-                 status_text.text("Using PyMuPDF for better performance...")
-                 text, total_pages = process_large_pdf_pymupdf(
-                     tmp_path,
-                     max_pages,
-                     update_progress
-                 )
             except Exception as e:
-                 # Fallback to PyPDF2 with streaming
-                 status_text.text("Falling back to PyPDF2...")
-                 text, total_pages = process_large_pdf_streaming(tmp_path, max_pages=max_pages)
-
-             # Clean up progress indicators
-             progress_bar.empty()
-             status_text.empty()
-
         elif file_extension == 'docx':
-             # Handle large DOCX files
             try:
-                 import docx
                 doc = docx.Document(tmp_path)
-                 paragraphs_processed = 0
-                 total_paragraphs = len(doc.paragraphs)
-
-                 progress_bar = st.progress(0)
-                 for i, paragraph in enumerate(doc.paragraphs):
                     text += paragraph.text + "\n"
-                     paragraphs_processed += 1
-
-                     # Update progress every 100 paragraphs
-                     if paragraphs_processed % 100 == 0:
-                         progress_bar.progress(paragraphs_processed / total_paragraphs)
-
-                 progress_bar.empty()
-
             except Exception as e:
                 raise ValueError(f"Error reading DOCX: {str(e)}")
 
-         # Handle other file types (TXT, Excel) - existing code
         elif file_extension == 'txt':
             try:
                 with open(tmp_path, 'r', encoding='utf-8') as file:
@@ -205,10 +238,11 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
             except UnicodeDecodeError:
                 with open(tmp_path, 'r', encoding='latin-1') as file:
                     text = file.read()
 
         elif file_extension in ['xlsx', 'xls']:
             try:
-                 import pandas as pd
                 df = pd.read_excel(tmp_path)
                 text = df.to_string()
             except Exception as e:
@@ -217,234 +251,431 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
         if not text.strip():
             raise ValueError("No text content found in the file")
 
-         # Apply compression if requested
-         if use_compression and len(text) > 50000:  # Compress if text > 50k chars
-             original_length = len(text)
-             text = compress_pdf_text(text)
-             st.info(f"📊 Text compressed: {original_length:,} → {len(text):,} characters")
-
-         # Enhanced analysis
-         analysis = analyze_document_structure_enhanced(text, uploaded_file.name, total_pages)
 
         return text, uploaded_file.name, analysis
 
     finally:
-         # Clean up temporary file
         try:
             if os.path.exists(tmp_path):
                 os.remove(tmp_path)
         except:
             pass
 
- def analyze_document_structure_enhanced(text, filename, total_pages=0):
-     """Enhanced document structure analysis"""
-     import re
-
-     analysis = {
-         'filename': filename,
-         'word_count': len(text.split()),
-         'char_count': len(text),
-         'total_pages': total_pages,
-         'estimated_pages': total_pages or len(text) // 2000,
-         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
-         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
-         'sections': [],
-         'key_terms': [],
-         'document_type': 'Unknown',
-         'language_detected': 'English',  # You could add language detection here
-         'complexity_score': 0
-     }
-
-     # Calculate complexity score based on various factors
-     complexity_factors = [
-         len(text) > 100000,  # Very long document
-         analysis['has_financial_data'],  # Contains financial data
-         analysis['has_tables'],  # Contains tables
-         len(re.findall(r'\d+', text)) > 1000,  # Many numbers
-         len(re.findall(r'[A-Z]{2,}', text)) > 100,  # Many acronyms
-     ]
-     analysis['complexity_score'] = sum(complexity_factors)
 
-     # Detect document type with more sophisticated rules
     text_lower = text.lower()
-     if any(term in text_lower for term in ['financial statement', 'balance sheet', 'income statement', 'cash flow']):
-         analysis['document_type'] = 'Financial Statement'
-     elif any(term in text_lower for term in ['annual report', '10-k', '10-q', 'sec filing']):
-         analysis['document_type'] = 'Annual Report'
-     elif any(term in text_lower for term in ['investment', 'portfolio', 'fund', 'prospectus']):
-         analysis['document_type'] = 'Investment Document'
-     elif any(term in text_lower for term in ['contract', 'agreement', 'terms', 'legal']):
-         analysis['document_type'] = 'Legal Document'
-     elif any(term in text_lower for term in ['research', 'analysis', 'study', 'report']):
-         analysis['document_type'] = 'Research Report'
-
-     # Extract sections (improved)
-     headers = re.findall(r'^[A-Z][A-Za-z\s]{5,50}$', text, re.MULTILINE)
-     # Also look for numbered sections
-     numbered_sections = re.findall(r'^\d+\.\s+[A-Z][A-Za-z\s]{5,50}$', text, re.MULTILINE)
 
-     all_headers = list(set(headers + numbered_sections))
-     analysis['sections'] = all_headers[:15]  # Top 15 sections
 
-     # Extract key financial and business terms
-     important_terms = re.findall(
-         r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin|growth|risk|compliance|strategy|market|competition|valuation|dividend|earnings|expenses|budget|forecast)\b',
-         text,
-         re.IGNORECASE
-     )
-     analysis['key_terms'] = list(set(important_terms))[:20]
 
-     return analysis
 
- # Configuration options for the sidebar
- def add_file_processing_options():
-     """Add file processing options to sidebar"""
-     st.sidebar.markdown("### ⚙️ Processing Options")
 
-     # File size limit
-     max_size = st.sidebar.slider(
-         "Max File Size (MB)",
-         min_value=10,
-         max_value=500,
-         value=100,
-         step=10,
-         help="Increase for larger files, but may consume more memory"
-     )
 
-     # Page limit for PDFs
-     limit_pages = st.sidebar.checkbox("Limit PDF Pages", value=False)
-     max_pages = None
-     if limit_pages:
-         max_pages = st.sidebar.number_input(
-             "Max Pages to Process",
-             min_value=1,
-             max_value=1000,
-             value=100,
-             help="Process only first N pages to save time and memory"
-         )
 
-     # Text compression
-     use_compression = st.sidebar.checkbox(
-         "Enable Text Compression",
-         value=True,
-         help="Compress extracted text to reduce memory usage"
-     )
 
-     # Processing method for PDFs
-     pdf_method = st.sidebar.selectbox(
-         "PDF Processing Method",
-         ["PyMuPDF (Recommended)", "PyPDF2 (Fallback)"],
-         help="PyMuPDF is faster and more reliable for large files"
-     )
 
-     return {
-         'max_size_mb': max_size,
-         'max_pages': max_pages,
-         'use_compression': use_compression,
-         'pdf_method': pdf_method
-     }
 
- # Memory management utilities
- def get_memory_usage():
-     """Get current memory usage (if psutil is available)"""
     try:
-         import psutil
-         process = psutil.Process()
-         memory_mb = process.memory_info().rss / 1024 / 1024
-         return f"{memory_mb:.1f} MB"
-     except ImportError:
-         return "N/A"
-
- def clear_large_variables():
-     """Clear large variables from session state to free memory"""
-     keys_to_clear = []
-     for key in st.session_state.keys():
-         if key.startswith('temp_') or key.endswith('_large'):
-             keys_to_clear.append(key)
-
-     for key in keys_to_clear:
-         del st.session_state[key]
-
-     # Force garbage collection
-     import gc
-     gc.collect()
 
- # Example usage in your main function:
- def enhanced_file_upload_section():
-     """Enhanced file upload section with better large file handling"""
-
-     # Add processing options
-     processing_options = add_file_processing_options()
 
-     # Memory usage display
-     memory_usage = get_memory_usage()
-     st.sidebar.write(f"💾 Memory Usage: {memory_usage}")
 
-     # Clear memory button
-     if st.sidebar.button("🧹 Clear Memory"):
-         clear_large_variables()
-         st.sidebar.success("Memory cleared!")
 
-     # File upload with dynamic size limit
-     st.sidebar.info(f"📋 **File Requirements:**\n- Max size: {processing_options['max_size_mb']}MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
 
-     uploaded_files = st.file_uploader(
-         "Choose files",
-         accept_multiple_files=True,
-         type=['pdf', 'docx', 'txt', 'xlsx'],
-         help=f"Supported formats: PDF, DOCX, TXT, XLSX (Max {processing_options['max_size_mb']}MB each)"
-     )
 
-     if uploaded_files:
-         valid_files = []
-         for file in uploaded_files:
-             is_valid, message = validate_file(file, processing_options['max_size_mb'])
-             if is_valid:
-                 valid_files.append(file)
-             else:
-                 st.error(f"❌ {message}")
 
-         if valid_files:
-             st.success(f"✅ {len(valid_files)} valid files ready!")
 
-             # Show processing options
-             if processing_options['max_pages']:
-                 st.info(f"📄 Will process first {processing_options['max_pages']} pages of PDF files")
 
-             if st.button("🔄 Process Documents", type="primary"):
-                 process_files_with_options(valid_files, processing_options)
 
- def process_files_with_options(files, options):
-     """Process files with the specified options"""
-     progress_bar = st.progress(0)
-     status_text = st.empty()
 
-     for i, file in enumerate(files):
-         status_text.text(f"Processing {file.name}...")
 
-         try:
-             # Use enhanced processing function
-             text, filename, analysis = process_document_enhanced(
-                 file,
-                 max_size_mb=options['max_size_mb'],
-                 max_pages=options['max_pages'],
-                 use_compression=options['use_compression']
-             )
 
-             # Store in session state
-             st.session_state.processed_docs[filename] = {
-                 'text': text,
-                 'analysis': analysis,
-                 'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                 'processing_options': options
-             }
 
-             st.success(f"✅ {filename} - {analysis['word_count']:,} words, {analysis['total_pages']} pages")
 
-         except Exception as e:
-             st.error(f"❌ Error processing {file.name}: {str(e)}")
 
-         progress_bar.progress((i + 1) / len(files))
-
-     status_text.text("✅ Processing complete!")
-     st.balloons()
 
 import streamlit as st
 import os
 import tempfile
+
+ # Fix cache permission issues in HF Spaces
+ os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
+ os.environ['HF_HOME'] = tempfile.gettempdir()
+ os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
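+ # (Note: TRANSFORMERS_CACHE is deprecated in newer transformers releases in
+ # favor of HF_HOME; setting both, as above, covers older and newer versions.)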
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
 import PyPDF2
+ import docx
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+ from chromadb.config import Settings
+ import tempfile
+ import uuid
+ import re
+ from datetime import datetime
 
+ # Page config
+ st.set_page_config(
+     page_title="FinanceGPT - Enterprise AI Assistant",
+     page_icon="💰",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+     .main-header {
+         font-size: 3rem;
+         color: #1f77b4;
+         text-align: center;
+         margin-bottom: 2rem;
+     }
+     .chat-message {
+         padding: 1rem;
+         border-radius: 0.5rem;
+         margin: 1rem 0;
+         background-color: #f0f2f6;
+     }
+     .source-box {
+         background-color: #e8f4f8;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         border-left: 4px solid #1f77b4;
+     }
+     .doc-summary {
+         background-color: #f8f9fa;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         border: 1px solid #dee2e6;
+         margin: 1rem 0;
+     }
+     .analysis-card {
+         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+         color: white;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         margin: 0.5rem 0;
+     }
+     .metric-card {
+         background-color: #ffffff;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+         text-align: center;
+         margin: 0.5rem 0;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if 'processed_docs' not in st.session_state:
+     st.session_state.processed_docs = {}
+ if 'analysis_cache' not in st.session_state:
+     st.session_state.analysis_cache = {}
+
+ # Document analysis types
+ ANALYSIS_TYPES = {
+     "📊 Financial Summary": {
+         "description": "Extract key financial metrics, ratios, and performance indicators",
+         "keywords": ["revenue", "profit", "loss", "assets", "liabilities", "cash flow", "ROI", "margin"],
+         "icon": "📊"
+     },
+     "⚠️ Risk Analysis": {
+         "description": "Identify potential risks, threats, and vulnerability factors",
+         "keywords": ["risk", "threat", "vulnerability", "exposure", "mitigation", "hedge", "insurance"],
+         "icon": "⚠️"
+     },
+     "📈 Market Trends": {
+         "description": "Analyze market conditions, trends, and competitive landscape",
+         "keywords": ["market", "trend", "growth", "competition", "industry", "outlook", "forecast"],
+         "icon": "📈"
+     },
+     "✅ Compliance Check": {
+         "description": "Review regulatory compliance and legal requirements",
+         "keywords": ["compliance", "regulation", "legal", "audit", "governance", "policy", "standard"],
+         "icon": "✅"
+     },
+     "💡 Investment Insights": {
+         "description": "Extract investment recommendations and opportunities",
+         "keywords": ["investment", "opportunity", "recommendation", "valuation", "return", "portfolio"],
+         "icon": "💡"
+     },
+     "📋 Executive Summary": {
+         "description": "Generate high-level overview and key takeaways",
+         "keywords": ["summary", "overview", "highlights", "conclusion", "recommendation", "action"],
+         "icon": "📋"
+     },
+     "🔍 Detailed Analysis": {
+         "description": "Comprehensive deep-dive analysis of all content",
+         "keywords": ["analysis", "detailed", "comprehensive", "thorough", "complete", "full"],
+         "icon": "🔍"
+     },
+     "📊 Data Extraction": {
+         "description": "Extract tables, numbers, and structured data",
+         "keywords": ["data", "table", "number", "figure", "statistic", "metric", "KPI"],
+         "icon": "📊"
+     }
+ }
+
+ @st.cache_resource
+ def load_models():
+     """Load and cache all models"""
+     try:
+         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+         model_name = "microsoft/DialoGPT-medium"
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+         model = AutoModelForCausalLM.from_pretrained(model_name)
+
+         client = chromadb.Client()
+         try:
+             collection = client.get_collection("documents")
+         except:
+             collection = client.create_collection(
+                 name="documents",
+                 metadata={"hnsw:space": "cosine"}
+             )
+
+         return embedding_model, tokenizer, model, collection
+     except Exception as e:
+         st.error(f"Error loading models: {str(e)}")
+         return None, None, None, None
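+ # (Note: the checkpoint loaded above is microsoft/DialoGPT-medium, while the UI
+ # banner advertises IBM Granite; pointing model_name at an ibm-granite/*
+ # checkpoint would make the two agree.)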
+
+ def validate_file(uploaded_file):
+     """Validate uploaded file"""
+     max_size = 50 * 1024 * 1024  # 50MB
     if uploaded_file.size > max_size:
+         return False, f"File {uploaded_file.name} is too large. Maximum size is 50MB."
 
     allowed_extensions = ['pdf', 'docx', 'txt', 'xlsx', 'xls']
     file_extension = uploaded_file.name.split('.')[-1].lower()
 
     return True, "Valid file"
 
+ def analyze_document_structure(text, filename):
+     """Analyze document structure and extract metadata"""
+     analysis = {
+         'filename': filename,
+         'word_count': len(text.split()),
+         'char_count': len(text),
+         'estimated_pages': len(text) // 2000,  # Rough estimate
+         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
+         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
+         'sections': [],
+         'key_terms': [],
+         'document_type': 'Unknown'
+     }
 
+     # Detect document type
+     if any(term in text.lower() for term in ['financial statement', 'balance sheet', 'income statement']):
+         analysis['document_type'] = 'Financial Statement'
+     elif any(term in text.lower() for term in ['annual report', '10-k', '10-q']):
+         analysis['document_type'] = 'Annual Report'
+     elif any(term in text.lower() for term in ['investment', 'portfolio', 'fund']):
+         analysis['document_type'] = 'Investment Document'
+     elif any(term in text.lower() for term in ['contract', 'agreement', 'terms']):
+         analysis['document_type'] = 'Legal Document'
+
+     # Extract sections (headers)
+     headers = re.findall(r'^[A-Z][A-Za-z\s]{10,50}$', text, re.MULTILINE)
+     analysis['sections'] = headers[:10]  # Top 10 sections
+
+     # Extract key financial terms
+     financial_terms = re.findall(r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin)\b', text, re.IGNORECASE)
+     analysis['key_terms'] = list(set(financial_terms))[:15]
 
+     return analysis
 
 @st.cache_data
+ def process_document(uploaded_file):
+     """Process uploaded document with enhanced analysis"""
+     is_valid, message = validate_file(uploaded_file)
 
     if not is_valid:
         raise ValueError(message)
 
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
             tmp_file.write(uploaded_file.getvalue())
 
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         text = ""
 
         if file_extension == 'pdf':
             try:
+                 with open(tmp_path, 'rb') as file:
+                     reader = PyPDF2.PdfReader(file)
+                     for page in reader.pages:
+                         text += page.extract_text() + "\n"
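+                         # (Note: extract_text() may yield an empty string for
+                         # pages with no text layer, e.g. scans; this rewrite
+                         # drops the old OCR fallback, so such pages add nothing.)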
 
 
             except Exception as e:
+                 raise ValueError(f"Error reading PDF: {str(e)}")
+
         elif file_extension == 'docx':
             try:
                 doc = docx.Document(tmp_path)
+                 for paragraph in doc.paragraphs:
                     text += paragraph.text + "\n"
             except Exception as e:
                 raise ValueError(f"Error reading DOCX: {str(e)}")
 
         elif file_extension == 'txt':
             try:
                 with open(tmp_path, 'r', encoding='utf-8') as file:
                     text = file.read()
             except UnicodeDecodeError:
                 with open(tmp_path, 'r', encoding='latin-1') as file:
                     text = file.read()
+             except Exception as e:
+                 raise ValueError(f"Error reading TXT: {str(e)}")
 
         elif file_extension in ['xlsx', 'xls']:
             try:
                 df = pd.read_excel(tmp_path)
                 text = df.to_string()
             except Exception as e:
 
         if not text.strip():
             raise ValueError("No text content found in the file")
 
+         # Analyze document structure
+         analysis = analyze_document_structure(text, uploaded_file.name)
 
         return text, uploaded_file.name, analysis
 
     finally:
         try:
             if os.path.exists(tmp_path):
                 os.remove(tmp_path)
         except:
             pass
 
+ def generate_analysis_by_type(text, analysis_type, analysis_info):
+     """Generate specific analysis based on type"""
+     keywords = analysis_info['keywords']
+     description = analysis_info['description']
 
+     # Find relevant sections based on keywords
+     relevant_sections = []
     text_lower = text.lower()
 
+     for keyword in keywords:
+         if keyword in text_lower:
+             # Find context around keywords
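+             # (Note: regex quantifier braces must be doubled inside an
+             # rf-string, {{0,200}}; a bare {0,200} would be evaluated by
+             # Python as the tuple (0, 200) and silently corrupt the pattern.)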
+             pattern = rf'.{{0,200}}\b{keyword}\b.{{0,200}}'
+             matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
+             relevant_sections.extend(matches[:3])  # Max 3 matches per keyword
 
+     if not relevant_sections:
+         return f"No specific information found for {analysis_type} in this document."
 
+     # Create structured analysis
+     analysis_result = f"""
+ ## {analysis_type}
+
+ **Analysis Focus**: {description}
+
+ **Key Findings**:
+ """
 
+     for i, section in enumerate(relevant_sections[:5], 1):
+         cleaned_section = re.sub(r'\s+', ' ', section.strip())
+         analysis_result += f"\n{i}. {cleaned_section[:300]}...\n"
 
+     analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}."
 
+     return analysis_result
+
+ def chunk_text(text, chunk_size=1000, overlap=200):
+     """Split text into chunks"""
+     if not text or not text.strip():
+         return []
+
+     chunks = []
+     start = 0
+
+     while start < len(text):
+         end = start + chunk_size
+         chunk = text[start:end]
+
+         if end < len(text):
+             last_period = chunk.rfind('.')
+             if last_period > chunk_size * 0.7:
+                 end = start + last_period + 1
+                 chunk = text[start:end]
+
+         if chunk.strip():
+             chunks.append(chunk.strip())
+
+         start = end - overlap
+
+         if start >= len(text):
+             break
+
+     return chunks
 
330
+ def search_documents(query, collection, embedding_model, n_results=3):
331
+ """Search for relevant document chunks"""
 
332
  try:
333
+ if collection.count() == 0:
334
+ return []
335
+
336
+ query_embedding = embedding_model.encode([query]).tolist()
337
+
338
+ results = collection.query(
339
+ query_embeddings=query_embedding,
340
+ n_results=min(n_results, collection.count()),
341
+ include=['documents', 'metadatas', 'distances']
342
+ )
343
+
344
+ search_results = []
345
+ if results['documents'] and results['documents'][0]:
346
+ for i in range(len(results['documents'][0])):
347
+ search_results.append({
348
+ 'content': results['documents'][0][i],
349
+ 'metadata': results['metadatas'][0][i],
350
+ 'score': 1 - results['distances'][0][i] if results['distances'][0][i] else 1.0
351
+ })
352
+
353
+ return search_results
354
+ except Exception as e:
355
+ st.error(f"Search error: {str(e)}")
356
+ return []
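+ # (Note: the collection is created with "hnsw:space": "cosine", so Chroma
+ # reports cosine distance and 1 - distance recovers a similarity; a 0.0
+ # distance, i.e. an exact match, falls through the `if` to the 1.0 default,
+ # which happens to agree.)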
 
+ def main():
+     # Header
+     st.markdown('<h1 class="main-header">💰 FinanceGPT - Enhanced Enterprise AI Assistant</h1>', unsafe_allow_html=True)
 
+     st.markdown("""
+     <div style="text-align: center; font-size: 1.2rem; color: #666; margin-bottom: 2rem;">
+         🚀 Powered by IBM Granite Models | 📊 Advanced Document Intelligence | 🔒 Secure & Compliant
+     </div>
+     """, unsafe_allow_html=True)
 
+     # Load models
+     with st.spinner("🔄 Loading AI models..."):
+         models = load_models()
+         if models[0] is None:
+             st.error("Failed to load AI models. Please refresh the page.")
+             return
+         embedding_model, tokenizer, model, collection = models
 
+     # Sidebar for document management
+     with st.sidebar:
+         st.header("📁 Enhanced Document Management")
+
+         # File upload section
+         st.markdown("### 📤 Upload Documents")
+         st.info("📋 **File Requirements:**\n- Max size: 50MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
+
+         uploaded_files = st.file_uploader(
+             "Choose files",
+             accept_multiple_files=True,
+             type=['pdf', 'docx', 'txt', 'xlsx'],
+             help="Supported formats: PDF, DOCX, TXT, XLSX (Max 50MB each)"
+         )
+
+         if uploaded_files:
+             valid_files = []
+             for file in uploaded_files:
+                 is_valid, message = validate_file(file)
+                 if is_valid:
+                     valid_files.append(file)
+                 else:
+                     st.error(f"❌ {message}")
+
+             if valid_files:
+                 st.success(f"✅ {len(valid_files)} valid files ready!")
+
+                 if st.button("🔄 Process Documents", type="primary"):
+                     progress_bar = st.progress(0)
+                     status_text = st.empty()
+
+                     for i, file in enumerate(valid_files):
+                         status_text.text(f"Processing {file.name}...")
+
+                         try:
+                             text, filename, analysis = process_document(file)
+
+                             # Store document analysis
+                             st.session_state.processed_docs[filename] = {
+                                 'text': text,
+                                 'analysis': analysis,
+                                 'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                             }
+
+                             # Create and store chunks
+                             chunks = chunk_text(text)
+                             if chunks:
+                                 for j, chunk in enumerate(chunks):
+                                     try:
+                                         chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
+                                         embedding = embedding_model.encode([chunk]).tolist()
+
+                                         collection.add(
+                                             embeddings=embedding,
+                                             documents=[chunk],
+                                             metadatas=[{'filename': filename, 'chunk_id': j}],
+                                             ids=[chunk_id]
+                                         )
+ except Exception as e:
435
+ continue
436
+
437
+ st.success(f"βœ… {filename}")
438
+
439
+ except Exception as e:
440
+ st.error(f"❌ Error processing {file.name}: {str(e)}")
441
+
442
+ progress_bar.progress((i + 1) / len(valid_files))
443
+
444
+ status_text.text("βœ… Processing complete!")
445
+ st.balloons()
446
+
447
+ # Document analysis section
448
+ if st.session_state.processed_docs:
449
+ st.markdown("---")
450
+ st.markdown("### πŸ“Š Document Analysis Options")
451
+
452
+ # Select document
453
+ doc_names = list(st.session_state.processed_docs.keys())
454
+ selected_doc = st.selectbox("Select Document:", doc_names)
455
+
456
+ if selected_doc:
457
+ doc_info = st.session_state.processed_docs[selected_doc]
458
+
459
+ # Document overview
460
+ st.markdown("#### πŸ“‹ Document Overview")
461
+ analysis = doc_info['analysis']
462
+
463
+ col1, col2 = st.columns(2)
464
+ with col1:
465
+ st.metric("Word Count", f"{analysis['word_count']:,}")
466
+ st.metric("Pages (Est.)", analysis['estimated_pages'])
467
+
468
+ with col2:
469
+ st.metric("Document Type", analysis['document_type'])
470
+ financial_status = "βœ… Yes" if analysis['has_financial_data'] else "❌ No"
471
+ st.write(f"**Financial Data**: {financial_status}")
472
+
473
+ # Key terms
474
+ if analysis['key_terms']:
475
+ st.markdown("**Key Terms Found:**")
476
+ st.write(", ".join(analysis['key_terms'][:10]))
477
+
478
+ # Analysis type selection
479
+ st.markdown("#### πŸ” Analysis Types")
480
+ analysis_type = st.selectbox(
481
+ "Choose Analysis Type:",
482
+ list(ANALYSIS_TYPES.keys()),
483
+ format_func=lambda x: f"{ANALYSIS_TYPES[x]['icon']} {x.split(' ', 1)[1]}"
484
+ )
485
+
486
+ if st.button(f"πŸš€ Generate {analysis_type}", use_container_width=True):
487
+ cache_key = f"{selected_doc}_{analysis_type}"
488
+
489
+ if cache_key not in st.session_state.analysis_cache:
490
+ with st.spinner(f"Generating {analysis_type}..."):
491
+ analysis_result = generate_analysis_by_type(
492
+ doc_info['text'],
493
+ analysis_type,
494
+ ANALYSIS_TYPES[analysis_type]
495
+ )
496
+ st.session_state.analysis_cache[cache_key] = analysis_result
497
+
498
+ # Display in main area
499
+ st.session_state.current_analysis = st.session_state.analysis_cache[cache_key]
500
+ st.session_state.current_analysis_type = analysis_type
501
 
502
+ # Main content area
503
+ col1, col2 = st.columns([2, 1])
 
 
 
 
504
 
505
+ with col1:
506
+ # Display analysis results if available
507
+ if hasattr(st.session_state, 'current_analysis'):
508
+ st.markdown(f"## {st.session_state.current_analysis_type}")
509
+ st.markdown(f'<div class="analysis-card">{st.session_state.current_analysis}</div>', unsafe_allow_html=True)
510
+
511
+ # Clear analysis button
512
+ if st.button("πŸ—‘οΈ Clear Analysis"):
513
+ if hasattr(st.session_state, 'current_analysis'):
514
+ del st.session_state.current_analysis
515
+ if hasattr(st.session_state, 'current_analysis_type'):
516
+ del st.session_state.current_analysis_type
517
+ st.rerun()
518
+
519
+ st.header("πŸ’¬ Interactive Q&A")
520
+
521
+ # Smart question suggestions
522
+ if st.session_state.processed_docs:
523
+ with st.expander("πŸ’‘ Smart Question Suggestions"):
524
+ # Generate context-aware questions
525
+ doc_types = set(doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values())
526
+
527
+ smart_questions = []
528
+ if 'Financial Statement' in doc_types:
529
+ smart_questions.extend([
530
+ "What are the key financial ratios mentioned?",
531
+ "Analyze the profitability trends",
532
+ "What are the major expense categories?"
533
+ ])
534
+ if 'Investment Document' in doc_types:
535
+ smart_questions.extend([
536
+ "What are the investment recommendations?",
537
+ "What risks are associated with these investments?",
538
+ "What is the expected return on investment?"
539
+ ])
540
+ if 'Annual Report' in doc_types:
541
+ smart_questions.extend([
542
+ "Summarize the company's performance this year",
543
+ "What are the future growth strategies?",
544
+ "What challenges does the company face?"
545
+ ])
546
+
547
+ # Default questions if no specific type detected
548
+ if not smart_questions:
549
+ smart_questions = [
550
+ "What are the key points in this document?",
551
+ "Summarize the main findings",
552
+ "What are the most important numbers mentioned?"
553
+ ]
554
+
555
+ for question in smart_questions[:6]:
556
+ if st.button(question, key=f"smart_{question}", use_container_width=True):
557
+ st.session_state.query = question
558
 
559
+ # Query input
560
+ query = st.text_area(
561
+ "Enter your question:",
562
+ value=st.session_state.get('query', ''),
563
+ placeholder="e.g., What are the main financial risks identified in the documents?",
564
+ height=100
565
+ )
566
+
567
+ if st.button("πŸ” Ask Question", type="primary", use_container_width=True):
568
+ if not query:
569
+ st.warning("⚠️ Please enter a question!")
570
+ return
571
 
572
+ if collection.count() == 0:
573
+ st.warning("⚠️ Please upload and process some documents first!")
574
+ return
575
 
576
+ with st.spinner("πŸ€– Analyzing documents and generating response..."):
577
+ try:
578
+ search_results = search_documents(query, collection, embedding_model)
579
+
580
+ if search_results:
581
+ # Enhanced response generation
582
+ context = ""
583
+ source_files = set()
584
+
585
+ for i, chunk in enumerate(search_results):
586
+ filename = chunk['metadata'].get('filename', 'Unknown')
587
+ source_files.add(filename)
588
+ context += f"[Source {i+1}: {filename}]\n{chunk['content'][:400]}...\n\n"
589
+
590
+ response = f"""
591
+ ### πŸ€– AI Analysis Results
592
+
593
+ **Query**: {query}
594
+
595
+ **Key Findings**:
596
+ {context[:1000]}...
597
+
598
+ **Summary**: Based on analysis of {len(search_results)} relevant sections from {len(source_files)} document(s), the information above directly addresses your question.
599
 
600
+ **Documents Analyzed**: {', '.join(source_files)}
601
+ """
602
+
603
+ st.markdown(response)
604
+
605
+ # Enhanced source display
606
+ st.markdown("### πŸ“š Detailed Sources")
607
+ for i, result in enumerate(search_results):
608
+ score_percent = f"{result['score']:.1%}"
609
+ filename = result['metadata'].get('filename', 'Unknown')
610
+
611
+ with st.expander(f"πŸ“„ Source {i+1}: {filename} (Relevance: {score_percent})"):
612
+ st.markdown(f'<div class="source-box">{result["content"]}</div>', unsafe_allow_html=True)
613
+ else:
614
+ st.error("❌ No relevant information found in the uploaded documents.")
615
+
616
+ except Exception as e:
617
+ st.error(f"❌ Error processing your question: {str(e)}")
618
 
619
+ with col2:
620
+ st.header("πŸ“Š Dashboard")
621
 
622
+ # Document statistics
623
+ if st.session_state.processed_docs:
624
+ st.markdown("### πŸ“ˆ Document Statistics")
 
 
 
 
 
625
 
626
+ total_words = sum(doc['analysis']['word_count'] for doc in st.session_state.processed_docs.values())
627
+ total_pages = sum(doc['analysis']['estimated_pages'] for doc in st.session_state.processed_docs.values())
628
+ doc_types = [doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values()]
 
 
 
 
629
 
630
+ col_a, col_b = st.columns(2)
631
+ with col_a:
632
+ st.metric("πŸ“„ Documents", len(st.session_state.processed_docs))
633
+ st.metric("πŸ“Š Total Words", f"{total_words:,}")
634
+ with col_b:
635
+ st.metric("πŸ“‘ Total Pages", total_pages)
636
+ st.metric("πŸ—‚οΈ Document Types", len(set(doc_types)))
637
 
638
+ # Document type breakdown
639
+ if doc_types:
640
+ st.markdown("**Document Types:**")
641
+ type_counts = {}
642
+ for doc_type in doc_types:
643
+ type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
644
+
645
+ for doc_type, count in type_counts.items():
646
+ st.write(f"β€’ {doc_type}: {count}")
647
 
+         # Project info
+         st.markdown("---")
+         st.header("🎯 Project Info")
+
+         st.markdown("""
+         ### **Built For IBM Hackathon**
+
+         **🔧 Technology Stack:**
+         - 🧠 IBM Granite Models
+         - 🔍 RAG (Retrieval-Augmented Generation)
+         - 📊 Streamlit UI
+         - 🗄️ ChromaDB Vector Database
+         - 🔒 Enterprise Security
+
+         **💼 Analysis Types:**
+         - 📊 Financial Summary
+         - ⚠️ Risk Analysis
+         - 📈 Market Trends
+         - ✅ Compliance Check
+         - 💡 Investment Insights
+         - 📋 Executive Summary
+         - 🔍 Detailed Analysis
+         - 📊 Data Extraction
+         """)
+
+         # Statistics
+         try:
+             doc_count = collection.count()
+             st.metric("🔗 Vector Chunks", doc_count)
+         except:
+             st.metric("🔗 Vector Chunks", 0)
+
+ if __name__ == "__main__":
+     main()