hskwon7 committed on
Commit
6352c7d
·
verified ·
1 Parent(s): 8f01031

Update modules.py

Browse files
Files changed (1) hide show
  1. modules.py +739 -75
modules.py CHANGED
@@ -1,78 +1,742 @@
1
- # modules.py
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- """
4
- modules.py
5
 
6
- Helper functions for the Image-to-Story Streamlit application.
7
- Provides:
8
- - Cached loaders for Hugging Face pipelines (captioning, story generation)
9
- - Inference functions: generate_caption, generate_story_simple, generate_audio
10
- """
11
- import streamlit as st
12
- import re
13
- from transformers import pipeline
14
- from gtts import gTTS
15
- import io
16
-
17
- @st.cache_resource
18
- def load_captioner():
19
- """Load and cache BLIP image captioning pipeline."""
20
- return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
21
-
22
- @st.cache_resource
23
- def load_story_gen():
24
- """Load and cache the genre-story-generator-v2 text-generation pipeline."""
25
- return pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
26
-
27
- def trim_to_sentence(text: str, max_words: int) -> str:
28
- """
29
- Trim the story to the last complete sentence under max_words.
30
- If no sentence fits, fallback to the first max_words words.
31
- """
32
- sentences = re.split(r'(?<=[.!?])\s+', text)
33
- trimmed = []
34
- count = 0
35
- for s in sentences:
36
- wc = len(s.split())
37
- if count + wc <= max_words:
38
- trimmed.append(s)
39
- count += wc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  else:
41
- break
42
- if trimmed:
43
- return " ".join(trimmed)
44
- # fallback to naive word trim
45
- return " ".join(text.split()[:max_words])
46
-
47
- def generate_caption(captioner, image) -> str:
48
- """Run the captioner pipeline on the PIL image."""
49
- raw = captioner(image)
50
- first = raw[0]
51
- return first.get("generated_text", "") if isinstance(first, dict) else str(first)
52
-
53
- def generate_story_simple(storyteller, prompt_text: str,
54
- min_words: int = 50, max_words: int = 100) -> str:
55
- """
56
- Generate a 50–100 word story:
57
- 1. Sample ~120 tokens with nucleus sampling.
58
- 2. If under min_words, re-sample ~200 tokens with higher top_p.
59
- 3. Trim to last sentence under max_words.
60
- """
61
- out = storyteller(prompt_text, max_new_tokens=120,
62
- do_sample=True, top_p=0.9, num_return_sequences=1)
63
- story = out[0]["generated_text"]
64
- if len(story.split()) < min_words:
65
- out = storyteller(prompt_text, max_new_tokens=200,
66
- do_sample=True, top_p=0.95, num_return_sequences=1)
67
- story = out[0]["generated_text"]
68
- return trim_to_sentence(story, max_words)
69
-
70
- def generate_audio(text: str) -> (bytes, str):
71
- """
72
- Convert text to MP3 bytes using gTTS.
73
- Returns (audio_bytes, mime_type) for use in st.audio(...).
74
- """
75
- tts = gTTS(text=text, lang="en")
76
- buf = io.BytesIO()
77
- tts.write_to_fp(buf)
78
- return buf.getvalue(), "audio/mp3"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import streamlit as st
4
+ # import matplotlib.pyplot as plt
5
+ # from matplotlib import rcParams
6
+ import numpy as np
7
+ # import yahooquery
8
+ import plotly.graph_objects as go
9
+ from datetime import datetime, timedelta
10
+ import pandas as pd
11
+ from yahooquery import Ticker
12
+ # import math
13
+ from typing import List
14
 
 
 
15
 
16
def load_etf_data():
    """Load the ETF CSV inputs used by the app.

    Reads ``etf_general_info_enriched.csv``, ``etf_analyst_report_full.csv``
    and ``annual_return.csv`` via relative paths. In the general-info and
    annual-return frames a ``ticker`` column is renamed to ``Ticker``.

    Returns:
        tuple: ``(df_etf, df_analyst_report, available_tickers,
        df_annual_return_master)`` where ``df_etf`` and ``available_tickers``
        are whatever ``set_etf_data`` returns for the general-info frame.
    """
    ticker_rename = {'ticker': 'Ticker'}

    general_info = pd.read_csv('etf_general_info_enriched.csv').rename(columns=ticker_rename)
    df_etf, available_tickers = set_etf_data(general_info)

    analyst_reports = pd.read_csv('etf_analyst_report_full.csv')
    annual_returns = pd.read_csv('annual_return.csv').rename(columns=ticker_rename)

    return df_etf, analyst_reports, available_tickers, annual_returns
22
+
23
def set_etf_data(df_src):
    """Filter the master ETF table and collect the set of valid tickers.

    Keeps rows with ``averageVolume`` above 1000 whose ``exchangeCountry`` is
    ``'United States'`` and whose ``categoryName`` is not missing.

    The ticker column may be named ``ticker`` or ``Ticker``: the loader in
    this file renames ``ticker`` to ``Ticker`` before calling this function,
    so looking up only the lowercase name raised ``KeyError``.

    Parameters:
        df_src (pd.DataFrame): Master ETF frame.

    Returns:
        tuple: ``(filtered_df, valid_ticker_set)`` where the set holds the
        upper-cased ticker symbols of the filtered rows.

    Raises:
        KeyError: If a required column (including both ticker spellings)
            is absent.
    """
    # Prefer the raw name for backward compatibility, fall back to the renamed one.
    ticker_col = 'ticker' if 'ticker' in df_src.columns else 'Ticker'

    keep = (df_src['averageVolume'] > 1000) & (df_src['exchangeCountry'] == 'United States')
    df = df_src[keep].dropna(subset=['categoryName'])

    # Skip missing tickers: NaN is a float and has no .upper().
    valid_ticker_set = {str(t).upper() for t in df[ticker_col].dropna().unique()}
    return df, valid_ticker_set
31
+
32
+ # Build a ticker → doc_text lookup
33
def make_doc_text(row):
    """Build a single descriptive text string for one ETF row.

    Each listed attribute of ``row`` is read in order; values that are
    missing (per ``pd.notna``) or blank after stripping are skipped. Labelled
    fields are rendered as ``"Label: value"``, unlabelled ones as the bare
    value, and all pieces are joined with ``". "``.
    """
    # (label, attribute name) in output order; a None label means "no prefix".
    layout = (
        (None, "shortName"),
        (None, "longName"),
        ("Issuer", "family"),
        ("Category", "categoryName"),
        ("Type", "legalType"),
        ("Position", "positionType"),
        ("Tags", "otherTags"),
        ("Return", "return_rating_text"),
        ("Risk", "risk_rating_text"),
        ("Expense Ratio", "annualReportExpenseRatio_rating_text"),
        ("Dividend Yield", "dividendYield_rating_text"),
        (None, "longBusinessSummary"),
        ("Holdings", "holdingInformation"),
    )

    pieces = []
    for label, attr in layout:
        value = getattr(row, attr)
        if not (pd.notna(value) and str(value).strip()):
            continue
        if label:
            pieces.append(f"{label}: {value}")
        else:
            pieces.append(str(value))

    return ". ".join(pieces)
56
+
57
+ # Helper: extract and filter ticker spans from tokens + labels
58
def extract_valid_tickers(tokens, labels, tokenizer, valid_set):
    """Collect ticker spans from BIO-tagged tokens and keep the known ones.

    A span starts at a ``"B-TICKER"`` label and is extended by following
    ``"I-TICKER"`` labels; any other label (or an ``"I-TICKER"`` with no open
    span) closes the current span. Each span is turned into text with
    ``tokenizer.convert_tokens_to_string``, stripped and upper-cased, and is
    returned only if it is a member of ``valid_set``.

    Returns:
        list: Matching ticker strings in order of appearance (duplicates kept).
    """
    grouped = []
    open_span = []
    for token, tag in zip(tokens, labels):
        if tag == "I-TICKER" and open_span:
            open_span.append(token)
            continue
        # Anything else ends the span in progress; a B- tag opens a new one.
        if open_span:
            grouped.append(open_span)
        open_span = [token] if tag == "B-TICKER" else []
    if open_span:
        grouped.append(open_span)

    candidates = (
        tokenizer.convert_tokens_to_string(group).strip().upper()
        for group in grouped
    )
    return [symbol for symbol in candidates if symbol in valid_set]
80
+
81
+ # Rule-based fallback: catch literal 2–4 char tickers in the text
82
def rule_fallback(query, valid_set):
    """Return the upper-cased 2-4 character alphanumeric words of ``query``
    that are members of ``valid_set``, as a set."""
    candidates = (
        match.group(0).upper()
        for match in re.finditer(r"\b[A-Za-z0-9]{2,4}\b", query)
    )
    return {symbol for symbol in candidates if symbol in valid_set}
85
+
86
def get_cols_to_display() -> List[str]:
    """Return the raw ETF column names selected for the recommendations table.

    The list is ordered: identity/size fields, trailing-return fields,
    summary fields, then the ``*_rating_text`` fields.
    """
    identity_and_size = [
        'ticker',
        'annualReportExpenseRatio',
        'previousCloseUSD',
        'averageVolumeUSD',
        'totalAssetsUSD',
        'longName',
        'marketCapUSD',
        'dividendYield',
    ]
    return_horizons = [
        'ytdReturn',
        'oneMonthReturn',
        'threeMonthReturn',
        'oneYearReturn',
        'threeYearReturn',
        'fiveYearReturn',
        'tenYearReturn',
    ]
    summary = [
        'avg_annual_return',
        'return_rating',
        'risk_rating',
        'positionType',
        'isLeveraged',
    ]
    # Every rated metric has a companion "<metric>_rating_text" column.
    rated_metrics = ['return', 'risk', 'annualReportExpenseRatio', 'dividendYield'] + return_horizons
    rating_texts = [f'{metric}_rating_text' for metric in rated_metrics]

    return identity_and_size + return_horizons + summary + rating_texts
124
+
125
def rename_etf_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with raw column names replaced by display names.

    Columns without an entry in the mapping are left as they are; the input
    frame itself is not modified.
    """
    display_names = {
        'ticker': 'Ticker',
        'annualReportExpenseRatio': 'Expense Ratio',
        'previousCloseUSD': 'Prev. Close',
        'averageVolumeUSD': 'Avg. Volume',
        'totalAssetsUSD': 'Total Assets',
        'longName': 'Full Name',
        'marketCapUSD': 'Market Cap.',
        'dividendYield': 'Dividend Yield',
        'ytdReturn': 'YTD Return',
        'oneMonthReturn': '1-month Return',
        'threeMonthReturn': '3-month Return',
        'oneYearReturn': '1-year Return',
        'threeYearReturn': '3-year Return',
        'fiveYearReturn': '5-year Return',
        'tenYearReturn': '10-year Return',
        'avg_annual_return': 'Avg. Annual Return %',
        'return_rating': 'Avg. Return Rating (1-10)',
        'risk_rating': 'Avg. Risk Rating (1-10)',
        'positionType': 'Position Type',
        'isLeveraged': 'Leveraged',
        'return_rating_text': 'Return Rating',
        'risk_rating_text': 'Risk Rating',
        'annualReportExpenseRatio_rating_text': 'Expense Ratio Rating',
        'dividendYield_rating_text': 'Dividend Yield Rating',
        'ytdReturn_rating_text': 'YTD Return Rating',
        'oneMonthReturn_rating_text': '1-month Return Rating',
        'threeMonthReturn_rating_text': '3-month Return Rating',
        'oneYearReturn_rating_text': '1-year Return Rating',
        'threeYearReturn_rating_text': '3-year Return Rating',
        'fiveYearReturn_rating_text': '5-year Return Rating',
        'tenYearReturn_rating_text': '10-year Return Rating',
    }

    # Restrict the mapping to names that are actually present in df.
    present = {}
    for raw_name, pretty_name in display_names.items():
        if raw_name in df.columns:
            present[raw_name] = pretty_name
    return df.rename(columns=present)
166
+
167
+
168
def get_etf_recommendations_from_list(
    list_of_fetched_etfs: List[str],
    df_etf: pd.DataFrame,
    top_n: int
) -> pd.DataFrame:
    """Build the display table for a list of fetched ETF tickers.

    Parameters
    ----------
    list_of_fetched_etfs : List[str]
        Ticker symbols to keep (matched against the ``ticker`` column).
    df_etf : pd.DataFrame
        ETF frame with raw column names; must contain ``ticker``,
        ``averageVolumeUSD`` and every column from ``get_cols_to_display()``.
    top_n : int
        Number of rows to keep after sorting.

    Returns
    -------
    pd.DataFrame
        The ``top_n`` matching rows with the highest ``averageVolumeUSD``,
        restricted to the display columns and renamed via
        ``rename_etf_columns``.
    """
    is_fetched = df_etf['ticker'].isin(list_of_fetched_etfs)

    top_by_volume = (
        df_etf[is_fetched]
        .copy()
        .sort_values(by='averageVolumeUSD', ascending=False)
        .head(top_n)
    )

    return rename_etf_columns(top_by_volume[get_cols_to_display()])
209
+
210
+
211
def format_number_short(x):
    """Format a number with a K/M/B/T magnitude suffix.

    Parameters:
        x (float or int): The number to format.

    Returns:
        str or the original value: A two-decimal string (``"1.50K"``,
        ``"2.50M"``, ...; no suffix below 1,000), or ``x`` unchanged when
        ``pd.isna(x)`` is true. The sign of ``x`` is kept.
    """
    if pd.isna(x):
        return x

    magnitude = abs(x)
    if magnitude < 1e3:
        return f"{x:.2f}"

    # (exclusive upper bound, divisor, suffix), checked smallest first.
    scales = ((1e6, 1e3, "K"), (1e9, 1e6, "M"), (1e12, 1e9, "B"))
    for upper_bound, divisor, suffix in scales:
        if magnitude < upper_bound:
            return f"{x/divisor:.2f}{suffix}"

    # Everything from a trillion upwards.
    return f"{x/1e12:.2f}T"
244
+
245
def transform_number_columns(df, columns):
    """Return a copy of ``df`` with the given columns short-formatted.

    Parameters:
        df (pd.DataFrame): The input DataFrame (left unmodified).
        columns (list): Column names to transform; names not present in
            ``df`` are ignored.

    Returns:
        pd.DataFrame: Copy of ``df`` where each listed column has had
        ``format_number_short`` applied to every value.
    """
    result = df.copy()
    targets = [name for name in columns if name in result.columns]
    for name in targets:
        result[name] = result[name].apply(format_number_short)
    return result
269
+
270
def transform_float_columns_to_perc(df, columns):
    """Return a copy of ``df`` with the given columns rendered as percentages.

    Each non-missing value is multiplied by 100 and formatted with two
    decimals and a trailing ``%`` (``0.25`` -> ``"25.00%"``). Missing values
    are left as they are.

    Parameters:
        df (pd.DataFrame): The input DataFrame (left unmodified).
        columns (list): Column names to transform; names not present in
            ``df`` are ignored.

    Returns:
        pd.DataFrame: A copy of the DataFrame with the listed columns
        converted to percentage strings.
    """
    def as_percent(value):
        if pd.notna(value):
            return f"{value * 100:.2f}%"
        return value

    result = df.copy()
    for name in columns:
        if name not in result.columns:
            continue
        result[name] = result[name].apply(as_percent)
    return result
296
+
297
def overview_df(df_recommendations, drop_relavance_score=True):
    """Build the "Overview" table from the recommendations frame.

    Selects whichever overview columns exist in ``df_recommendations``,
    short-formats 'Total Assets' and 'Market Cap.', and renders
    'Relevance Score' as a percentage.

    Parameters:
        df_recommendations (pd.DataFrame): Frame with display-named columns.
        drop_relavance_score (bool): When True, remove 'Relevance Score'
            from the result.

    Returns:
        pd.DataFrame: The formatted overview table.
    """
    overview_cols = ["Leveraged", "Ticker", "Full Name", 'Category', 'Country', 'Total Assets', "Prev. Close",
                     "Avg. Volume", 'Market Cap.', "Relevance Score"]
    existing_cols = [col for col in overview_cols if col in df_recommendations.columns]
    df_overview = transform_number_columns(df_recommendations[existing_cols], ['Total Assets', 'Market Cap.'])
    df_overview = transform_float_columns_to_perc(df_overview, columns=['Relevance Score'])
    if drop_relavance_score:
        # 'Relevance Score' is optional (it is filtered by existing_cols above),
        # so dropping it must not fail when the column was never there.
        df_overview = df_overview.drop(columns=['Relevance Score'], errors='ignore')
    return df_overview
306
+
307
def transform_return_columns(df, cols=None):
    """Render every column whose name ends with 'Return' as percentages.

    Each non-missing value in such a column is multiplied by 100 and
    formatted with two decimals and a trailing ``%``. Missing values are
    left as they are.

    Parameters:
        df (pd.DataFrame): The input DataFrame (left unmodified).
        cols: Accepted but not used by this function.

    Returns:
        pd.DataFrame: A copy of the DataFrame with transformed 'Return' columns.
    """
    def as_percent(value):
        return f"{value * 100:.2f}%" if pd.notna(value) else value

    result = df.copy()
    return_columns = [name for name in result.columns if name.endswith('Return')]
    for name in return_columns:
        result[name] = result[name].apply(as_percent)
    return result
334
+
335
def return_df(df_recommendations):
    """Build the "Returns" table: identity columns plus the trailing-return
    columns that exist in ``df_recommendations``, with the return columns
    formatted by ``transform_return_columns``."""
    wanted = (
        "Ticker", "Full Name", 'Category', "YTD Return", "1-month Return",
        "3-month Return", "1-year Return", "3-year Return",
        "5-year Return", "10-year Return",
    )
    available = [name for name in wanted if name in df_recommendations.columns]
    return transform_return_columns(df_recommendations[available])
345
+
346
+
347
def clean_ratings_columns(df):
    """Reduce rating-text columns to a bare 'High' / 'Moderate' / 'Low'.

    For each known rating column present in ``df``, every value is converted
    to ``str`` and replaced by the first of 'High', 'Moderate', 'Low' that it
    contains, or by an empty string when it contains none of them (this
    includes missing values).

    The work is done on a copy: the previous in-place ``df.loc[:, col] = ...``
    assignment modified the caller's frame and triggered pandas
    chained-assignment / dtype warnings when handed a column selection.

    Parameters:
        df (pd.DataFrame): Frame with display-named columns (left unmodified).

    Returns:
        pd.DataFrame: Copy of ``df`` with the rating columns cleaned.
    """
    rating_cols = ['YTD Return Rating', '1-month Return Rating', '3-month Return Rating',
                   '1-year Return Rating', '3-year Return Rating', '5-year Return Rating', '10-year Return Rating',
                   'Expense Ratio Rating', 'Dividend Yield Rating']
    strings_to_keep = ['High', 'Moderate', 'Low']

    def keep_level(raw_text):
        # First matching level wins, in the order listed above.
        return next((level for level in strings_to_keep if level in raw_text), '')

    cleaned = df.copy()
    for col in rating_cols:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(str).apply(keep_level)
    return cleaned
359
+
360
def rating_df(df_recommendations):
    """Build the "Ratings" table: identity columns plus the rating columns
    that exist in ``df_recommendations``, cleaned by ``clean_ratings_columns``."""
    # NOTE(review): 'Avg. Return Rating' / 'Avg. Risk Rating' are not names
    # produced by rename_etf_columns in this file - confirm they are intended.
    wanted = (
        "Ticker", "Full Name", 'Category', "Avg. Return Rating (1-10)", "Avg. Risk Rating (1-10)",
        'Avg. Return Rating', 'Avg. Risk Rating', 'YTD Return Rating', '1-month Return Rating', '3-month Return Rating',
        '1-year Return Rating', '3-year Return Rating', '5-year Return Rating', '10-year Return Rating',
    )
    available = [name for name in wanted if name in df_recommendations.columns]
    return clean_ratings_columns(df_recommendations[available])
370
+
371
def expense_ratio_df(df_recommendations):
    """Build the "Expenses" table.

    Keeps the expense-related columns that exist, short-formats
    'Total Assets', renders 'Expense Ratio' as a percentage, and cleans the
    rating column via ``clean_ratings_columns``.
    """
    wanted = ("Ticker", "Full Name", "Category", 'Total Assets', 'Expense Ratio', 'Expense Ratio Rating')
    available = [name for name in wanted if name in df_recommendations.columns]

    table = transform_number_columns(df_recommendations[available], ['Total Assets'])
    table = transform_float_columns_to_perc(table, columns=['Expense Ratio'])
    return clean_ratings_columns(table)
378
+
379
def holdings_df(df_recommendations):
    """Return the "Holdings" view: those of Ticker / Full Name / Category /
    Holdings that exist in ``df_recommendations``, in that order."""
    wanted = ("Ticker", "Full Name", "Category", "Holdings")
    available = [name for name in wanted if name in df_recommendations.columns]
    return df_recommendations[available]
383
+
384
def dividend_df(df_recommendations):
    """Build the "Dividends" table.

    Keeps the dividend-related columns that exist, cleans the rating column
    via ``clean_ratings_columns``, then renders 'Dividend Yield' as a
    percentage.
    """
    wanted = ("Ticker", "Full Name", "Category", "Dividend Yield", "Dividend Yield Rating")
    available = [name for name in wanted if name in df_recommendations.columns]

    table = clean_ratings_columns(df_recommendations[available])
    return transform_float_columns_to_perc(table, columns=['Dividend Yield'])
390
+
391
def display_matching_etfs(df_recommendations):
    """Render the recommendations as six Streamlit tabs.

    Does nothing when ``df_recommendations`` is empty. Otherwise creates the
    tabs Overview / Returns / Ratings / Holdings / Expenses / Dividends and
    shows the matching table in each with ``st.dataframe(..., hide_index=True)``.
    Returns ``None``.
    """
    if df_recommendations.empty:
        return

    # Tab title paired with the function that builds that tab's table.
    views = [
        ("Overview", overview_df),
        ("Returns", return_df),
        ("Ratings", rating_df),
        ('Holdings', holdings_df),
        ('Expenses', expense_ratio_df),
        ('Dividends', dividend_df),
    ]

    tabs = st.tabs([title for title, _ in views])
    for tab, (_, build_table) in zip(tabs, views):
        with tab:
            st.dataframe(build_table(df_recommendations), hide_index=True)

    return
422
+
423
def compare_etfs_interactive(etf_a, etf_b):
    """Plot the relative 5-year performance of two ETFs.

    Requests daily history (``period='5y'``, ``interval='1d'``) for both
    symbols through ``yahooquery.Ticker``, inner-joins the two adjusted-close
    series on date, and expresses each as percentage change from its first
    value in the joined data.

    Parameters:
        etf_a (str): Ticker symbol for the first ETF.
        etf_b (str): Ticker symbol for the second ETF.

    Returns:
        plotly.graph_objects.Figure: Line chart with one trace per ETF and
        the x-axis range set to the last ``5 * 365`` days.
    """
    end_date = datetime.today()
    start_date = end_date - timedelta(days=5 * 365)

    history = (
        Ticker(f'{etf_a} {etf_b}', asynchronous=True)
        .history(period='5y', interval='1d')
        .reset_index()
    )

    def adj_close_for(symbol, label):
        # One symbol's rows, reduced to date + adjusted close under `label`.
        rows = history[history.symbol == symbol]
        return rows.rename(columns={'adjclose': label})[['date', label]]

    merged = pd.merge(
        adj_close_for(etf_a, 'Adj Close A'),
        adj_close_for(etf_b, 'Adj Close B'),
        on='date',
        how='inner',
    ).set_index('date')

    for price_col, pct_col in (('Adj Close A', 'Pct Change A'), ('Adj Close B', 'Pct Change B')):
        merged[pct_col] = (merged[price_col] / merged[price_col].iloc[0] - 1) * 100

    fig = go.Figure()
    for pct_col, trace_name in (('Pct Change A', etf_a), ('Pct Change B', etf_b)):
        fig.add_trace(
            go.Scatter(
                x=merged.index,
                y=merged[pct_col],
                mode='lines',
                name=trace_name
            )
        )

    fig.update_layout(
        title=f'5-Year Performance Comparison: {etf_a} vs. {etf_b}',
        xaxis_title='Date',
        yaxis_title='Percentage Change (%)',
        hovermode='x unified'
    )
    fig.update_xaxes(range=[start_date, end_date])

    return fig
483
+
484
def clean_etf_text(text: str) -> str:
    """Clean and normalise a block of ETF description text.

    Steps, in order:
    - strip surrounding whitespace and wrapping quotes
    - join hyphenated line breaks ('NASDAQ-\\n100' -> 'NASDAQ-100')
    - turn remaining line breaks into spaces and collapse repeated whitespace
    - insert a missing space after ``. , ! ?`` when a letter follows directly
    - capitalise the first character

    The spacing step only fires before a letter, and skips a period that
    follows a single-letter word. The earlier rule (space before any
    non-whitespace character) corrupted numbers and abbreviations:
    '0.09%' -> '0. 09%', '1,000' -> '1, 000', 'U.S.' -> 'U. S.'.

    Parameters:
        text (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    # Remove leading/trailing whitespace and outer quotes if present
    text = text.strip().strip('"').strip("'")

    # Fix hyphenated line breaks (e.g., 'NASDAQ-\n100' -> 'NASDAQ-100')
    text = re.sub(r'-\s*\n\s*', '-', text)

    # Replace remaining line breaks with spaces
    text = re.sub(r'[\n\r]+', ' ', text)

    # Remove excessive spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Space after , ! ? when glued to the next word (digits are left alone so
    # thousands separators survive).
    text = re.sub(r'([,!?])(?=[^\W\d_])', r'\1 ', text)

    # Space after a sentence-ending period glued to the next word. Decimals
    # are untouched (a digit follows) and so are single-letter abbreviations
    # such as 'U.S.' (the lookbehind rejects a one-letter word before the dot).
    text = re.sub(r'(?<!\b[^\W\d_])\.(?=[^\W\d_])', '. ', text)

    # Capitalize the first letter if needed
    if text and text[0].islower():
        text = text[0].upper() + text[1:]

    return text.strip()
512
+
513
def trim_to_last_full_sentence(text: str) -> str:
    """Drop a trailing incomplete sentence from ``text``.

    Text that already ends (ignoring trailing whitespace) in ``.``, ``!`` or
    ``?`` is returned unchanged. Otherwise the text is split where sentence
    punctuation is followed by whitespace and an uppercase ASCII letter; the
    final fragment is discarded and the rest re-joined with single spaces.
    If no such boundary exists the text is returned unchanged.
    """
    if text.rstrip().endswith(('.', '!', '?')):
        return text

    *complete, _dangling = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    if not complete:
        # Only one fragment: there is nothing that can be trimmed.
        return text

    return ' '.join(complete).strip()
529
+
530
def lookup_etf_report(ticker, df_analyst_report):
    """Placeholder: currently ignores both arguments and returns ``None``."""
    return None
532
+
533
def form_display_comparison_table(df_etf, list_of_parsed_tickers):
    """Build a formatted side-by-side comparison table for the given tickers.

    Filters ``df_etf`` on its ``Ticker`` column, keeps the columns of
    interest, renames them to display names, then formats the return columns
    and 'Expense Ratio' as percentages and 'Avg. Volume' in short form.
    """
    # Raw column -> display name, in output order.
    pretty_names = {
        'Ticker': 'Ticker',
        'longName': 'Full Name',
        'categoryName': 'Category',
        'previousCloseUSD': 'Prev. Close',
        'averageVolumeUSD': 'Avg. Volume',
        'return_rating': 'Return Rating (1-10)',
        'risk_rating': 'Risk Rating (1-10)',
        'ytdReturn': 'YTD Return',
        'oneMonthReturn': '1-month Return',
        'threeMonthReturn': '3-month Return',
        'oneYearReturn': '1-year Return',
        'threeYearReturn': '3-year Return',
        'fiveYearReturn': '5-year Return',
        'tenYearReturn': '10-year Return',
        'annualReportExpenseRatio': 'Expense Ratio',
    }

    is_requested = df_etf['Ticker'].isin(list_of_parsed_tickers)
    table = df_etf[is_requested][list(pretty_names)].rename(columns=pretty_names)

    table = transform_return_columns(table)
    table = transform_float_columns_to_perc(table, columns=['Expense Ratio'])
    table = transform_number_columns(table, ['Avg. Volume'])

    return table
551
+
552
def portfolio_interactive_chart(df_port_output):
    """Plot portfolio growth as a Plotly line chart.

    Every column of ``df_port_output`` other than ``"year"`` and ``"Total"``
    becomes its own line against ``"year"``; ``"Total"`` is added last as a
    dashed line.

    Returns:
        plotly.graph_objects.Figure: The assembled figure.
    """
    fig = go.Figure()

    asset_columns = [
        name for name in df_port_output.columns if name not in ("year", "Total")
    ]
    for asset in asset_columns:
        fig.add_trace(go.Scatter(
            x=df_port_output["year"],
            y=df_port_output[asset],
            mode='lines',
            name=asset
        ))

    # The aggregate line is dashed to set it apart from the per-asset lines.
    total_trace = go.Scatter(
        x=df_port_output["year"],
        y=df_port_output["Total"],
        mode='lines',
        name="Total",
        line=dict(dash='dash')
    )
    fig.add_trace(total_trace)

    fig.update_layout(
        title="Portfolio Growth Over Time",
        xaxis_title="Year",
        yaxis_title="Portfolio Value (USD)",
        hovermode='x unified'
    )

    return fig
584
+
585
def set_estimated_return(tickers, df_general_info, df_annual_return):
    """Estimate an annual return for each ticker.

    For every ticker:
      1. Take its 'oneYearReturn', 'threeYearReturn', 'fiveYearReturn' and
         'tenYearReturn' values from ``df_general_info``, treat NaN as 0 and
         average the non-zero ones (0 when there are none).
      2. Take the mean of 'fundReturn' for that ticker in
         ``df_annual_return``; if that is null use the mean of
         'categoryReturn'; if still null use 0.
      3. Use the trailing mean from step 1 unless it equals 0, in which case
         use the value from step 2.

    Parameters:
        tickers (iterable): Ticker symbols to process.
        df_general_info (pd.DataFrame): Frame with a 'Ticker' column and the
            trailing-return columns.
        df_annual_return (pd.DataFrame): Frame with 'Ticker', 'fundReturn'
            and 'categoryReturn' columns.

    Returns:
        tuple: ``(df, d)`` where ``df`` has columns 'etf' and
        'estimated_annual_return' and ``d`` is ``df.to_dict()``.
    """
    trailing_returns_cols = ['oneYearReturn', 'threeYearReturn', 'fiveYearReturn', 'tenYearReturn']
    annual_return_col = 'fundReturn'
    cat_annual_return_col = 'categoryReturn'

    def trailing_mean(ticker):
        # Mean of the non-zero trailing returns; NaN counts as zero.
        raw = df_general_info[df_general_info['Ticker'] == ticker][trailing_returns_cols].values
        filled = np.nan_to_num(raw, nan=0)
        usable = filled[filled != 0]
        return np.mean(usable) if len(usable) > 0 else 0

    def annual_mean(ticker):
        # Fund-level mean first, category-level mean second, 0 as last resort.
        value = df_annual_return[df_annual_return['Ticker'] == ticker][annual_return_col].mean()
        if pd.isnull(value):
            value = df_annual_return[df_annual_return['Ticker'] == ticker][cat_annual_return_col].mean()
        return 0 if pd.isnull(value) else value

    estimates = []
    for ticker in tickers:
        from_trailing = trailing_mean(ticker)
        from_annual = annual_mean(ticker)
        chosen = from_annual if from_trailing == 0 else from_trailing
        estimates.append((ticker, chosen))

    df = pd.DataFrame({
        'etf': [ticker for ticker, _ in estimates],
        'estimated_annual_return': [value for _, value in estimates],
    })
    return df, df.to_dict()
661
+
662
def form_d_chat_history(result_id, response, task, fig=None, df=None):
    """Bundle one chat result into a dict with the keys
    ``id``, ``task``, ``response``, ``fig`` and ``df`` (in that order)."""
    return dict(
        id=result_id,
        task=task,
        response=response,
        fig=fig,
        df=df,
    )
671
+
672
def portfolio_growth_over_time(df, target_years=30):
    """Project the yearly value of each asset and of the whole portfolio.

    The DataFrame should have columns:
      - 'etf'
      - 'initial_investment'
      - 'estimated_annual_return' (a number, or a string containing '%'
        such as "10%", which is converted to a decimal)
      - 'amount_of_recurring_investments'

    The annual return is divided by 12 and compounded monthly; the recurring
    amount is treated as one contribution per month.

    Parameters:
        df (pd.DataFrame): Input DataFrame with asset details.
        target_years (int): Total number of years to project (default is 30).

    Returns:
        tuple: ``(portfolio_data, last_row)`` where ``portfolio_data`` has a
        'year' column (0..target_years), one column per asset and a 'Total'
        column, and ``last_row`` is the final row as a dict.
    """
    years = np.arange(0, target_years + 1)
    portfolio_data = pd.DataFrame({'year': years})

    def value_after(months, principal, contribution, monthly_rate):
        # Compounded initial investment plus the future value of the
        # monthly contributions (plain sum when the rate is zero).
        growth = (1 + monthly_rate) ** months
        if monthly_rate != 0:
            contributed = contribution * ((growth - 1) / monthly_rate)
        else:
            contributed = contribution * months
        return principal * growth + contributed

    for _, asset in df.iterrows():
        name = asset['etf']
        principal = asset['initial_investment']
        contribution = asset['amount_of_recurring_investments']
        annual_rate = asset['estimated_annual_return']
        if isinstance(annual_rate, str) and '%' in annual_rate:
            annual_rate = float(annual_rate.strip('%')) / 100.0

        monthly_rate = annual_rate / 12
        portfolio_data[name] = [
            value_after(int(year * 12), principal, contribution, monthly_rate)
            for year in years
        ]

    # Row-wise sum across the per-asset columns.
    portfolio_data['Total'] = portfolio_data[df['etf'].tolist()].sum(axis=1)

    return portfolio_data, portfolio_data.iloc[-1].to_dict()
724
+
725
def run_portfolio_analysis(list_of_parsed_tickers, df_etf, df_annual_return_master):
    """Project a fixed-scenario portfolio for the given tickers.

    Every ticker gets an initial investment of 1000 and a recurring amount
    of 100, projected over 30 years. Estimated returns come from
    ``set_estimated_return``; missing values after the merge are filled
    with 0.

    Returns:
        pd.DataFrame: The projection table produced by
        ``portfolio_growth_over_time``.
    """
    # Portfolio Analysis configuration
    target_years = 30
    init_investment = 1000
    recur_monthly = 100

    n_assets = len(list_of_parsed_tickers)
    df_port_input = pd.DataFrame({
        'etf': list_of_parsed_tickers,
        'initial_investment': [init_investment] * n_assets,
        'amount_of_recurring_investments': [recur_monthly] * n_assets,
    })

    df_est_return, _ = set_estimated_return(
        tickers=list_of_parsed_tickers,
        df_general_info=df_etf,
        df_annual_return=df_annual_return_master,
    )

    df_port_input = df_port_input.merge(df_est_return, how='left', on='etf').fillna(0)

    df_port_output, _ = portfolio_growth_over_time(df=df_port_input, target_years=target_years)
    return df_port_output