sukhrobnurali committed on
Commit
a921556
·
1 Parent(s): f10f47f

v1 Application is Ready

Browse files
Files changed (9) hide show
  1. .dockerignore +39 -0
  2. .env.example +1 -0
  3. .gitignore +9 -0
  4. Dockerfile +41 -0
  5. README.md +130 -6
  6. app.py +297 -0
  7. criteria.py +160 -0
  8. document_processor.py +153 -0
  9. requirements.txt +5 -0
.dockerignore ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Environment
9
+ .env
10
+ .venv
11
+ venv/
12
+ ENV/
13
+
14
+ # IDE
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+ *.swo
19
+
20
+ # Testing
21
+ .pytest_cache/
22
+ .coverage
23
+ htmlcov/
24
+
25
+ # Extra documentation not needed in the image (the Dockerfile copies only README.md)
26
+ QUICKSTART.md
27
+ DEPLOYMENT.md
28
+
29
+ # Git
30
+ .git/
31
+ .gitignore
32
+
33
+ # PDFs and uploads
34
+ *.pdf
35
+ uploads/
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=your_openai_api_key_here
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ .env
5
+ .venv
6
+ venv/
7
+ *.pdf
8
+ .streamlit/
9
+ uploads/
Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use Python 3.11 slim image for smaller size
2
+ FROM python:3.11-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies: build tools for Python packages, and curl for the HEALTHCHECK below
8
+ RUN apt-get update && apt-get install -y \
9
+ build-essential \
10
+ curl \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy requirements first for better layer caching
14
+ COPY requirements.txt .
15
+
16
+ # Install Python dependencies
17
+ RUN pip install --no-cache-dir -r requirements.txt
18
+
19
+ # Copy application files
20
+ COPY app.py .
21
+ COPY document_processor.py .
22
+ COPY criteria.py .
23
+ COPY README.md .
24
+
25
+ # Create directory for temporary file uploads
26
+ RUN mkdir -p /app/uploads
27
+
28
+ # Expose port 7860 (Hugging Face Spaces standard port)
29
+ EXPOSE 7860
30
+
31
+ # Set environment variables for Streamlit
32
+ ENV STREAMLIT_SERVER_PORT=7860
33
+ ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
34
+ ENV STREAMLIT_SERVER_HEADLESS=true
35
+ ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
36
+
37
+ # Health check
38
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health || exit 1
39
+
40
+ # Run the Streamlit app
41
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1,11 +1,135 @@
1
  ---
2
- title: Financial Document Analyzer
3
- emoji: 🐢
4
  colorFrom: blue
5
- colorTo: purple
6
- sdk: docker
 
 
7
  pinned: false
8
- short_description: An AI-powered financial document analyzer that screens compa
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Intelligent Investment Screener
3
+ emoji: 📊
4
  colorFrom: blue
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.31.0
8
+ app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # Intelligent Investment Screener
14
+
15
+ An AI-powered financial document analyzer that screens company annual reports against specific investment criteria using RAG (Retrieval-Augmented Generation).
16
+
17
+ ## What It Does
18
+
19
+ Instead of manually reading 100-page annual reports to find specific financial metrics, this tool:
20
+
21
+ 1. **Accepts** company financial documents (10-K, Annual Reports)
22
+ 2. **Extracts** key financial metrics (debt ratios, revenue breakdown, etc.)
23
+ 3. **Analyzes** them against customizable investment criteria
24
+ 4. **Returns** a Pass/Fail decision with **citations** (page numbers and sections)
25
+
26
+ ## Key Features
27
+
28
+ ### Citation-Based Analysis
29
+ Every finding includes:
30
+ - Exact page number
31
+ - Specific section or table name
32
+ - Relevance score
33
+
34
+ This transforms the tool from a "magic box" to a **trusted, verifiable assistant**.
35
+
36
+ ### Multiple Screening Criteria
37
+
38
+ 1. **Shariah Compliance**: Islamic finance screening
39
+ - Debt ratio < 33%
40
+ - Interest income < 5%
41
+ - No prohibited activities (alcohol, gambling, etc.)
42
+
43
+ 2. **ESG (Environmental, Social, Governance)**: Sustainable investing
44
+ - Carbon emissions disclosure
45
+ - Board diversity > 30%
46
+ - No environmental violations
47
+ - Labor practice compliance
48
+
49
+ 3. **Value Investing**: Traditional value metrics
50
+ - P/E ratio < 15
51
+ - Debt to Equity < 0.5
52
+ - Positive free cash flow
53
+ - Revenue growth > 5%
54
+
55
+ ## Technical Architecture
56
+
57
+ ### Tech Stack
58
+ - **Frontend**: Streamlit
59
+ - **LLM**: OpenAI GPT-4o-mini
60
+ - **RAG Framework**: LlamaIndex
61
+
62
+ ### How It Works
63
+
64
+ ```
65
+ PDF Upload → LlamaIndex Parser → Vector Index → OpenAI Analysis → Cited Results
66
+ ```
67
+
68
+ 1. **Document Loading**: LlamaIndex parses PDF and preserves page metadata
69
+ 2. **Vector Indexing**: Creates searchable embeddings of document chunks
70
+ 3. **Criteria Analysis**: OpenAI GPT-4o-mini analyzes relevant sections against rules
71
+ 4. **Citation Extraction**: Page numbers and sections are tracked throughout
72
+ 5. **Results Display**: Pass/Fail with verifiable citations
73
+
74
+ ## Quick Start
75
+
76
+ ### Running Locally
77
+
78
+ ```bash
79
+ # Clone the repository
80
+ git clone <your-repo-url>
81
+ cd investment-screener
82
+
83
+ # Install dependencies
84
+ pip install -r requirements.txt
85
+
86
+ # Set up environment variables
87
+ cp .env.example .env
88
+ # Add your OPENAI_API_KEY to .env
89
+
90
+ # Run the app
91
+ streamlit run app.py
92
+ ```
93
+
94
+ ### Get an OpenAI API Key
95
+
96
+ 1. Visit [OpenAI Platform](https://platform.openai.com/api-keys)
97
+ 2. Sign up or log in
98
+ 3. Click "Create new secret key"
99
+ 4. Copy the key and add it to `.env` or enter it in the app sidebar
100
+
101
+ ## Usage
102
+
103
+ 1. **Select Screening Criteria**: Choose from Shariah, ESG, or Value Investing
104
+ 2. **Upload Document**: Upload an annual report or 10-K filing (PDF)
105
+ 3. **Analyze**: Click the analyze button
106
+ 4. **Review Results**:
107
+ - Overall Pass/Fail decision
108
+ - Detailed metric-by-metric breakdown
109
+ - Citations with page numbers for verification
110
+
111
+
112
+ ## Example Use Cases
113
+
114
+ ### Shariah Compliance Screening
115
+ Investors following Islamic finance principles need to ensure companies:
116
+ - Don't derive significant income from interest
117
+ - Maintain acceptable debt levels
118
+ - Don't operate in prohibited industries
119
+
120
+ ### ESG Screening
121
+ Socially responsible investors want to verify:
122
+ - Environmental impact disclosures
123
+ - Corporate governance practices
124
+ - Social responsibility metrics
125
+
126
+ ### Value Investing
127
+ Traditional investors need quick access to:
128
+ - Valuation ratios (P/E, P/B)
129
+ - Financial health metrics
130
+ - Growth indicators
131
+
132
+
133
+ ---
134
+
135
+ **Note**: This tool is for informational purposes only. Always verify financial data and consult with qualified financial advisors before making investment decisions.
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Intelligent Investment Screener
3
+ A RAG-based application for analyzing company financial reports against investment criteria.
4
+ """
5
+
6
+ import streamlit as st
7
+ import os
8
+ import json
9
+ import tempfile
10
+ from pathlib import Path
11
+ from dotenv import load_dotenv
12
+
13
+ from document_processor import InvestmentDocumentProcessor
14
+ from criteria import CRITERIA_OPTIONS
15
+
16
# Read variables (such as OPENAI_API_KEY) from a .env file into the environment.
load_dotenv()

# Browser-tab title, icon and layout for the Streamlit page.
_PAGE_SETTINGS = {
    "page_title": "Investment Screener",
    "page_icon": "📊",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet for the header, the pass/fail badges, citation boxes and metric
# cards rendered by the display helpers in this module.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
margin-bottom: 0.5rem;
}
.sub-header {
font-size: 1.2rem;
color: #666;
margin-bottom: 2rem;
}
.pass-badge {
background-color: #28a745;
color: white;
padding: 0.5rem 1rem;
border-radius: 0.5rem;
font-weight: bold;
display: inline-block;
margin: 0.5rem 0;
}
.fail-badge {
background-color: #dc3545;
color: white;
padding: 0.5rem 1rem;
border-radius: 0.5rem;
font-weight: bold;
display: inline-block;
margin: 0.5rem 0;
}
.citation {
background-color: #f8f9fa;
border-left: 4px solid #007bff;
padding: 1rem;
margin: 0.5rem 0;
border-radius: 0.25rem;
}
.metric-card {
background-color: #ffffff;
padding: 1.5rem;
border-radius: 0.5rem;
border: 1px solid #e0e0e0;
margin: 1rem 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
73
+
74
+
75
def initialize_session_state():
    """Seed ``st.session_state`` with defaults for any key not set yet.

    Keys that already exist are left untouched, so values survive reruns.
    """
    defaults = (
        ('processor', None),
        ('analysis_result', None),
        ('document_loaded', False),
    )
    for key, initial_value in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial_value
83
+
84
+
85
def display_criteria_rules(criteria):
    """Render every rule of *criteria* as a bold name/description line
    followed by a caption with its threshold.

    Args:
        criteria: Mapping with a ``'rules'`` list; each rule is a mapping
            with ``'name'``, ``'description'`` and ``'threshold'`` keys.
    """
    st.subheader("Screening Rules")
    for entry in criteria['rules']:
        headline = f"**{entry['name']}**: {entry['description']}"
        st.markdown(headline)
        threshold_note = f"Threshold: {entry['threshold']}"
        st.caption(threshold_note)
91
+
92
+
93
def display_analysis_result(result, criteria_name):
    """Render the outcome of an analysis run, including citations.

    Args:
        result: Dict produced by the document processor. The keys
            ``overall_pass``, ``summary``, ``citations``,
            ``source_nodes_count``, ``parse_error`` and ``raw_response`` are
            treated as bookkeeping; every other key whose value is a dict is
            rendered as a metric card.
        criteria_name: Name of the selected strategy. Currently unused; kept
            so existing callers keep working.
    """
    bookkeeping_keys = {
        'overall_pass', 'summary', 'citations',
        'source_nodes_count', 'parse_error', 'raw_response',
    }

    st.markdown("---")
    st.markdown("## Analysis Results")

    parse_failed = bool(result.get('parse_error'))

    if parse_failed:
        # Without parsed JSON there is no verdict at all; showing the red
        # "FAILED" badge here would misreport a parsing problem as a
        # screening failure.
        st.warning(
            "The model response could not be parsed as JSON, so no Pass/Fail "
            "verdict is available. See the raw response below."
        )
    elif result.get('overall_pass', False):
        st.markdown('<div class="pass-badge">✓ PASSED - Investment Compatible</div>',
                    unsafe_allow_html=True)
    else:
        st.markdown('<div class="fail-badge">✗ FAILED - Does Not Meet Criteria</div>',
                    unsafe_allow_html=True)

    # Summary
    if 'summary' in result:
        st.markdown("### Summary")
        st.info(result['summary'])

    # Detailed metrics: everything that is not bookkeeping and is a dict.
    st.markdown("### Detailed Analysis")
    for metric_name, metric_data in result.items():
        if metric_name in bookkeeping_keys or not isinstance(metric_data, dict):
            continue
        display_metric_card(metric_name, metric_data)

    # Citations section
    citations = result.get('citations')
    if citations:
        st.markdown("### 📚 Citations & Sources")
        st.caption(f"Analysis based on {result.get('source_nodes_count', 0)} relevant document sections")

        for citation in citations[:5]:  # Show top 5 citations
            display_citation(citation)

    # Debug: show the raw response when it could not be parsed.
    if parse_failed:
        with st.expander("Raw LLM Response (Debug)"):
            st.text(result.get('raw_response', 'No response'))
136
+
137
+
138
def display_metric_card(metric_name, metric_data):
    """Display a single metric with its status, details and citation.

    Args:
        metric_name: Result key such as ``"debt_ratio"``; shown title-cased
            with underscores replaced by spaces.
        metric_data: Dict of values for the metric. The status is taken from
            ``pass``, falling back to ``compliant`` and then ``disclosed``.
            ``page`` and ``location`` are shown as the citation; all other
            keys except ``pass`` are listed as details.
    """
    # Local import keeps this block self-contained.
    from html import escape

    formatted_name = metric_name.replace('_', ' ').title()

    # Determine pass/fail; None means the model reported no status at all.
    passed = metric_data.get(
        'pass', metric_data.get('compliant', metric_data.get('disclosed'))
    )
    if passed is None:
        # Do not present "unknown" as a failure.
        status_icon, status_color = "?", "gray"
    elif passed:
        status_icon, status_color = "✓", "green"
    else:
        status_icon, status_color = "✗", "red"

    # The card is emitted as one complete element: each st.markdown call is
    # rendered on its own, so a <div> opened here cannot wrap later calls.
    st.markdown(
        f'<div class="metric-card">'
        f'<h4 style="color: {status_color};">{status_icon} {escape(formatted_name)}</h4>'
        f'</div>',
        unsafe_allow_html=True,
    )

    # Display metric details
    for key, value in metric_data.items():
        if key in ('pass', 'page', 'location'):
            continue
        if isinstance(value, bool):
            value = "Yes" if value else "No"
        st.markdown(f"**{key.replace('_', ' ').title()}**: {value}")

    # Citation info. Page and location come from the model's answer, so they
    # are escaped before being placed into raw HTML.
    if 'page' in metric_data and 'location' in metric_data:
        page_html = escape(str(metric_data['page']))
        location_html = escape(str(metric_data['location']))
        st.markdown(f"""
<div style="margin-top: 1rem; padding: 0.5rem; background-color: #e7f3ff; border-radius: 0.25rem;">
📄 <strong>Found on Page {page_html}</strong><br>
📍 Section: {location_html}
</div>
""", unsafe_allow_html=True)
    elif 'page' in metric_data:
        st.markdown(f"📄 **Page {metric_data['page']}**")
174
+
175
+
176
def display_citation(citation):
    """Display a citation box for one retrieved source passage.

    Args:
        citation: Dict with ``page``, ``score`` and ``text_preview`` keys.
            A missing or non-numeric ``score`` is shown as ``n/a``.
    """
    # Local import keeps this block self-contained.
    from html import escape

    score = citation.get('score')
    # The score can be absent/None; formatting None with ':.2%' would raise.
    if isinstance(score, (int, float)) and not isinstance(score, bool):
        relevance = f"{score:.2%}"
    else:
        relevance = "n/a"

    # The preview is raw text taken from the uploaded document; escape it so
    # markup inside the PDF cannot be injected into the page.
    page_html = escape(str(citation.get('page', 'Unknown')))
    preview_html = escape(str(citation.get('text_preview', '')))

    st.markdown(f"""
<div class="citation">
<strong>Page {page_html}</strong> (Relevance: {relevance})<br>
<small>{preview_html}</small>
</div>
""", unsafe_allow_html=True)
184
+
185
+
186
def main():
    """Main application.

    Builds the page: header, sidebar (API key and criteria selection), the
    upload column that indexes the PDF, the analysis column that runs the
    selected criteria, and finally the stored analysis result.
    """
    # Local import keeps this block self-contained.
    import hashlib

    initialize_session_state()

    # Header
    st.markdown('<div class="main-header">📊 Intelligent Investment Screener</div>',
                unsafe_allow_html=True)
    st.markdown('<div class="sub-header">AI-powered financial document analysis with citations</div>',
                unsafe_allow_html=True)

    # Sidebar
    with st.sidebar:
        st.markdown("## Configuration")

        # API key: the environment wins, otherwise ask the user.
        api_key = os.getenv('OPENAI_API_KEY', '')
        if not api_key:
            api_key = st.text_input(
                "OpenAI API Key",
                type="password",
                help="Get your API key at https://platform.openai.com/api-keys"
            )

        if not api_key:
            st.warning("Please enter your OpenAI API key to continue.")
            st.stop()

        # Criteria selection
        st.markdown("## Screening Criteria")
        selected_criteria_name = st.selectbox(
            "Select Investment Strategy",
            options=list(CRITERIA_OPTIONS.keys())
        )

        criteria = CRITERIA_OPTIONS[selected_criteria_name]

        with st.expander("View Criteria Details"):
            st.markdown(f"**{criteria['name']}**")
            st.caption(criteria['description'])
            display_criteria_rules(criteria)

        st.markdown("---")
        st.markdown("### About")
        st.caption("""
This tool uses RAG (Retrieval-Augmented Generation) to analyze
financial documents against specific investment criteria.
All findings include page citations for verification.
""")

    # Main content
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### Upload Document")
        uploaded_file = st.file_uploader(
            "Upload Annual Report or 10-K Filing (PDF)",
            type=['pdf'],
            help="Upload a company's annual report or SEC 10-K filing"
        )

        if uploaded_file is None:
            # The file was removed: drop the stale index and result so the
            # analysis cannot run on a document that is no longer shown.
            if st.session_state.document_loaded:
                st.session_state.processor = None
                st.session_state.document_loaded = False
                st.session_state.analysis_result = None
                st.session_state['loaded_file_key'] = None
        else:
            file_bytes = uploaded_file.getvalue()
            # A content hash identifies the upload, so replacing the PDF with
            # a different one triggers re-indexing.
            file_key = hashlib.sha256(file_bytes).hexdigest()

            needs_load = (
                not st.session_state.document_loaded
                or st.session_state.processor is None
                or st.session_state.get('loaded_file_key') != file_key
            )

            if needs_load:
                # Forget the previous document before attempting the new one.
                st.session_state.processor = None
                st.session_state.document_loaded = False
                st.session_state.analysis_result = None

                # The reader needs a path, so write the upload to a temp file.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    tmp_file.write(file_bytes)
                    tmp_path = tmp_file.name

                try:
                    with st.spinner("Loading and indexing document..."):
                        try:
                            processor = InvestmentDocumentProcessor(api_key)
                            processor.load_pdf(tmp_path)
                            st.session_state.processor = processor
                            st.session_state.document_loaded = True
                            st.session_state['loaded_file_key'] = file_key

                            # Show document info
                            doc_info = processor.get_document_summary()
                            st.success(f"✓ Document loaded: {doc_info['num_pages']} pages")

                        except Exception as e:
                            st.error(f"Error loading document: {str(e)}")
                            st.stop()
                finally:
                    # Runs even when st.stop() aborts the script, so the temp
                    # file is never left behind.
                    Path(tmp_path).unlink(missing_ok=True)

    with col2:
        st.markdown("### Analysis")

        if st.session_state.document_loaded:
            if st.button("🔍 Analyze Document", type="primary", use_container_width=True):
                with st.spinner(f"Analyzing against {selected_criteria_name} criteria..."):
                    try:
                        result = st.session_state.processor.analyze_with_criteria(
                            criteria['analysis_prompt']
                        )
                        st.session_state.analysis_result = result

                    except Exception as e:
                        st.error(f"Analysis error: {str(e)}")
                        st.exception(e)

        else:
            st.info("Upload a PDF document to begin analysis")

    # Display results
    if st.session_state.analysis_result is not None:
        display_analysis_result(st.session_state.analysis_result, selected_criteria_name)
294
+
295
+
296
+ if __name__ == "__main__":
297
+ main()
criteria.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Investment screening criteria definitions.
3
+ Each criterion includes rules and the analysis prompt for the LLM.
4
+ """
5
+
6
# Shariah screening definition: the rules shown in the UI plus the prompt sent
# to the LLM. The cash-ratio denominator is stated the same way in the rule
# text and in the prompt (total assets, or market cap when available), matching
# how the debt ratio is described.
SHARIAH_CRITERIA = {
    "name": "Shariah Compliance",
    "description": "Islamic finance screening for halal investments",
    "rules": [
        {
            "name": "Debt Ratio",
            "threshold": "< 33%",
            "description": "Total debt must be less than 33% of market capitalization or total assets"
        },
        {
            "name": "Interest Income",
            "threshold": "< 5%",
            "description": "Interest-bearing income must be less than 5% of total revenue"
        },
        {
            "name": "Prohibited Activities",
            "threshold": "0%",
            "description": "No involvement in alcohol, gambling, pork products, conventional banking, or adult entertainment"
        },
        {
            "name": "Cash & Interest-Bearing Securities",
            "threshold": "< 33%",
            "description": "Cash and interest-bearing securities must be less than 33% of market capitalization or total assets"
        }
    ],
    "analysis_prompt": """You are a Shariah compliance analyst. Analyze this financial document and extract the following:

1. **Debt Ratio**: Calculate total debt / total assets (or market cap if available). Must be < 33%.
2. **Interest Income**: Find interest income / total revenue. Must be < 5%.
3. **Prohibited Activities**: Check for revenue from alcohol, gambling, pork, conventional banking, or adult entertainment.
4. **Cash Ratio**: Calculate (cash + interest-bearing securities) / total assets (or market cap if available). Must be < 33%.

For EACH finding, you MUST provide:
- The exact value or percentage
- The page number where you found it
- The specific section or table name (e.g., "Balance Sheet, Note 5")

Format your response as JSON:
{
"debt_ratio": {"value": "X%", "page": N, "location": "Section name", "pass": true/false},
"interest_income": {"value": "X%", "page": N, "location": "Section name", "pass": true/false},
"prohibited_activities": {"found": true/false, "details": "...", "page": N, "location": "Section name", "pass": true/false},
"cash_ratio": {"value": "X%", "page": N, "location": "Section name", "pass": true/false},
"overall_pass": true/false,
"summary": "Brief explanation"
}

If you cannot find specific information, state "Not found in document" but still cite where you looked."""
}
55
+
56
# ESG screening definition: the rules shown in the UI plus the prompt sent to
# the LLM. The board-diversity rule text, its threshold and the prompt all
# state the same "> 30%" bar, and the location example names a section only
# (the page number is a separate field in the requested JSON).
ESG_CRITERIA = {
    "name": "ESG (Environmental, Social, Governance)",
    "description": "Sustainable and responsible investment screening",
    "rules": [
        {
            "name": "Carbon Emissions Disclosure",
            "threshold": "Required",
            "description": "Company must disclose Scope 1 and 2 emissions"
        },
        {
            "name": "Board Diversity",
            "threshold": "> 30%",
            "description": "More than 30% of board members should be women or minorities"
        },
        {
            "name": "Environmental Violations",
            "threshold": "None",
            "description": "No major environmental fines or violations in past 2 years"
        },
        {
            "name": "Labor Practices",
            "threshold": "Compliant",
            "description": "No labor rights violations or controversies"
        }
    ],
    "analysis_prompt": """You are an ESG investment analyst. Analyze this financial document and extract the following:

1. **Carbon Emissions**: Find Scope 1 and Scope 2 emissions disclosures.
2. **Board Diversity**: Find percentage of women or minorities on the board. Should be > 30%.
3. **Environmental Violations**: Check for environmental fines or legal issues.
4. **Labor Practices**: Look for labor controversies or violations.

For EACH finding, you MUST provide:
- The specific data point
- The page number where you found it
- The specific section name (e.g., "Sustainability Report, Emissions Data")

Format your response as JSON:
{
"carbon_disclosure": {"disclosed": true/false, "scope1": "X tons", "scope2": "Y tons", "page": N, "location": "Section name", "pass": true/false},
"board_diversity": {"percentage": "X%", "details": "...", "page": N, "location": "Section name", "pass": true/false},
"environmental_violations": {"found": true/false, "details": "...", "page": N, "location": "Section name", "pass": true/false},
"labor_practices": {"compliant": true/false, "details": "...", "page": N, "location": "Section name", "pass": true/false},
"overall_pass": true/false,
"summary": "Brief explanation"
}

If you cannot find specific information, state "Not found in document" but still cite where you looked."""
}
105
+
106
# Value-investing screening definition: the rules shown in the UI plus the
# prompt sent to the LLM. The prompt is assembled line by line; joining with
# "\n" yields the same text as a single multi-line literal.
VALUE_INVESTING_CRITERIA = dict(
    name="Value Investing",
    description="Traditional value investing metrics",
    rules=[
        dict(
            name="P/E Ratio",
            threshold="< 15",
            description="Price to Earnings ratio should be below 15",
        ),
        dict(
            name="Debt to Equity",
            threshold="< 0.5",
            description="Debt to Equity ratio should be below 0.5",
        ),
        dict(
            name="Free Cash Flow",
            threshold="Positive",
            description="Company must have positive free cash flow",
        ),
        dict(
            name="Revenue Growth",
            threshold="> 5%",
            description="Year-over-year revenue growth should exceed 5%",
        ),
    ],
    analysis_prompt="\n".join((
        'You are a value investing analyst. Analyze this financial document and extract the following:',
        '',
        '1. **P/E Ratio**: Calculate or find Price to Earnings ratio. Should be < 15.',
        '2. **Debt to Equity**: Calculate total debt / total equity. Should be < 0.5.',
        '3. **Free Cash Flow**: Find operating cash flow minus capital expenditures. Must be positive.',
        '4. **Revenue Growth**: Calculate year-over-year revenue growth. Should be > 5%.',
        '',
        'For EACH finding, you MUST provide:',
        '- The exact value or ratio',
        '- The page number where you found it',
        '- The specific section or table name',
        '',
        'Format your response as JSON:',
        '{',
        '"pe_ratio": {"value": X, "page": N, "location": "Section name", "pass": true/false},',
        '"debt_to_equity": {"value": X, "page": N, "location": "Section name", "pass": true/false},',
        '"free_cash_flow": {"value": "$X", "positive": true/false, "page": N, "location": "Section name", "pass": true/false},',
        '"revenue_growth": {"value": "X%", "page": N, "location": "Section name", "pass": true/false},',
        '"overall_pass": true/false,',
        '"summary": "Brief explanation"',
        '}',
        '',
        'If you cannot find specific information, state "Not found in document" but still cite where you looked.',
    )),
)
155
+
156
# Display label -> criteria definition. Entries are added in the order they
# should be listed, since dicts keep insertion order.
CRITERIA_OPTIONS = {}
CRITERIA_OPTIONS["Shariah Compliance"] = SHARIAH_CRITERIA
CRITERIA_OPTIONS["ESG Screening"] = ESG_CRITERIA
CRITERIA_OPTIONS["Value Investing"] = VALUE_INVESTING_CRITERIA
document_processor.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document processing with LlamaIndex.
3
+ Handles PDF parsing, indexing, and querying with citation tracking.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from typing import Dict, Any, List
9
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
10
+ from llama_index.llms.openai import OpenAI
11
+ from llama_index.core.node_parser import SimpleNodeParser
12
+ from llama_index.core.schema import NodeWithScore
13
+
14
+
15
class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations.

    Typical use: construct with an OpenAI API key, call :meth:`load_pdf` once,
    then call :meth:`analyze_with_criteria` (or :meth:`quick_search`) any
    number of times against the indexed document.
    """

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key.

        Args:
            api_key: Key used for both the chat model and the embedding model.
        """
        # Local import: only this constructor needs the embedding class.
        from llama_index.embeddings.openai import OpenAIEmbedding

        # Configure OpenAI GPT-4o-mini (cheap and fast)
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1  # Low temperature for factual extraction
        )

        # Embeddings need the key as well. Without this, indexing only works
        # when OPENAI_API_KEY happens to be set in the environment, and a key
        # supplied by the caller alone is not enough.
        self.embed_model = OpenAIEmbedding(api_key=api_key)

        # Keep the global defaults in step with this instance.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model

        # Node parser to chunk documents while preserving metadata
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=1024,
            chunk_overlap=200
        )

        self.index = None
        self.documents = None

    @staticmethod
    def _page_label(metadata: Dict[str, Any]) -> Any:
        """Return the page recorded in *metadata*, or ``'Unknown'``."""
        return metadata.get('page_label', metadata.get('page', 'Unknown'))

    @staticmethod
    def _parse_json_response(raw_text: str) -> Dict[str, Any]:
        """Parse the model's answer into a dict.

        The text is tried as-is first, then the span between the first ``{``
        and the last ``}`` (this copes with Markdown code fences or prose
        around the object). If neither yields a JSON object, a dict with
        ``raw_response`` and ``parse_error: True`` is returned instead.
        """
        stripped = raw_text.strip()
        candidates = [stripped]
        first_brace, last_brace = stripped.find("{"), stripped.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(stripped[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            # Callers add keys to the result, so only an object is usable.
            if isinstance(parsed, dict):
                return parsed

        return {"raw_response": raw_text, "parse_error": True}

    def load_pdf(self, pdf_path: str) -> None:
        """Load and index a PDF document.

        Args:
            pdf_path: Path of the PDF file to read.

        Raises:
            ValueError: If the reader returns no documents for the file.
        """
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True
        )

        documents = reader.load_data()
        if not documents:
            raise ValueError("No readable content found in the PDF.")

        # Add page numbers to metadata if not present
        for doc in documents:
            if 'page_label' not in doc.metadata:
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')

        # Create vector index. The chunker is passed via `transformations`;
        # the previous `node_parser=` keyword is not a parameter of
        # from_documents, so the configured chunking was not being applied.
        self.index = VectorStoreIndex.from_documents(
            documents,
            transformations=[self.node_parser],
            embed_model=self.embed_model,
            show_progress=True
        )
        self.documents = documents

    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """Analyze the document against investment criteria.

        Args:
            criteria_prompt: Full instruction text sent to the query engine.

        Returns:
            The parsed JSON answer (or a ``parse_error`` dict, see
            :meth:`_parse_json_response`) with two keys added: ``citations``
            and ``source_nodes_count``.

        Raises:
            ValueError: If no document has been loaded.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # The LLM is passed explicitly so this instance's key is used
        # regardless of the global Settings.
        query_engine = self.index.as_query_engine(
            llm=self.llm,
            similarity_top_k=10,  # Get more context
            response_mode="tree_summarize"
        )

        response = query_engine.query(criteria_prompt)

        analysis_result = self._parse_json_response(str(response))
        analysis_result['citations'] = self._extract_citations(response.source_nodes)
        analysis_result['source_nodes_count'] = len(response.source_nodes)

        return analysis_result

    def _extract_citations(self, source_nodes: List["NodeWithScore"]) -> List[Dict[str, Any]]:
        """Extract citation information from source nodes.

        Each citation holds a 1-based ``index``, the ``page``, the retrieval
        ``score``, a ``text_preview`` (first 200 characters, with ``...``
        appended when truncated) and the ``file_name``.
        """
        citations = []

        for position, scored in enumerate(source_nodes, start=1):
            text = scored.node.text
            preview = text[:200] + "..." if len(text) > 200 else text
            citations.append({
                "index": position,
                "page": self._page_label(scored.node.metadata),
                "score": scored.score,
                "text_preview": preview,
                "file_name": scored.node.metadata.get('file_name', 'Unknown')
            })

        return citations

    def get_document_summary(self) -> Dict[str, Any]:
        """Get basic document information.

        Returns:
            ``num_pages`` (number of loaded document objects), ``file_name``
            and ``total_chars``; or ``{"error": "No document loaded"}`` when
            nothing has been loaded.
        """
        # `not` also covers an empty list, which would otherwise fail on [0].
        if not self.documents:
            return {"error": "No document loaded"}

        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents)
        }

    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Perform a quick search in the document.

        Useful for finding specific sections or terms.

        Args:
            query: Text to search for.
            top_k: Number of passages to return.

        Returns:
            One dict per retrieved passage with ``page``, ``text`` and
            ``score``.

        Raises:
            ValueError: If no document has been loaded.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text"  # Just return nodes, no generation
        )

        response = query_engine.query(query)

        return [
            {
                "page": self._page_label(scored.node.metadata),
                "text": scored.node.text,
                "score": scored.score
            }
            for scored in response.source_nodes
        ]
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit>=1.31.0
2
+ llama-index>=0.10.0
3
+ openai>=1.0.0
4
+ pypdf>=4.0.0
5
+ python-dotenv>=1.0.0