ISOM5240-Final-Assignment

Sleeping

App Files Files Community

hskwon7 commited on May 19, 2025

Commit

6352c7d

verified ·

1 Parent(s): 8f01031

Update modules.py

Browse files

Files changed (1) hide show

modules.py +739 -75

modules.py CHANGED Viewed

@@ -1,78 +1,742 @@
-# modules.py
-"""
-modules.py
-Helper functions for the Image-to-Story Streamlit application.
-Provides:
-- Cached loaders for Hugging Face pipelines (captioning, story generation)
-- Inference functions: generate_caption, generate_story_simple, generate_audio
-"""
-import streamlit as st
-import re
-from transformers import pipeline
-from gtts import gTTS
-import io
-@st.cache_resource
-def load_captioner():
-    """Load and cache BLIP image captioning pipeline."""
-    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-@st.cache_resource
-def load_story_gen():
-    """Load and cache the genre-story-generator-v2 text-generation pipeline."""
-    return pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
-def trim_to_sentence(text: str, max_words: int) -> str:
-    """
-    Trim the story to the last complete sentence under max_words.
-    If no sentence fits, fallback to the first max_words words.
-    """
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    trimmed = []
-    count = 0
-    for s in sentences:
-        wc = len(s.split())
-        if count + wc <= max_words:
-            trimmed.append(s)
-            count += wc
         else:
-            break
-    if trimmed:
-        return " ".join(trimmed)
-    # fallback to naive word trim
-    return " ".join(text.split()[:max_words])
-def generate_caption(captioner, image) -> str:
-    """Run the captioner pipeline on the PIL image."""
-    raw = captioner(image)
-    first = raw[0]
-    return first.get("generated_text", "") if isinstance(first, dict) else str(first)
-def generate_story_simple(storyteller, prompt_text: str,
-                          min_words: int = 50, max_words: int = 100) -> str:
-    """
-    Generate a 50–100 word story:
-      1. Sample ~120 tokens with nucleus sampling.
-      2. If under min_words, re-sample ~200 tokens with higher top_p.
-      3. Trim to last sentence under max_words.
-    """
-    out = storyteller(prompt_text, max_new_tokens=120,
-                      do_sample=True, top_p=0.9, num_return_sequences=1)
-    story = out[0]["generated_text"]
-    if len(story.split()) < min_words:
-        out = storyteller(prompt_text, max_new_tokens=200,
-                          do_sample=True, top_p=0.95, num_return_sequences=1)
-        story = out[0]["generated_text"]
-    return trim_to_sentence(story, max_words)
-def generate_audio(text: str) -> (bytes, str):
-    """
-    Convert text to MP3 bytes using gTTS.
-    Returns (audio_bytes, mime_type) for use in st.audio(...).
-    """
-    tts = gTTS(text=text, lang="en")
-    buf = io.BytesIO()
-    tts.write_to_fp(buf)
-    return buf.getvalue(), "audio/mp3"

+import pandas as pd
+import re
+import streamlit as st
+# import matplotlib.pyplot as plt
+# from matplotlib import rcParams
+import numpy as np
+# import yahooquery
+import plotly.graph_objects as go
+from datetime import datetime, timedelta
+import pandas as pd
+from yahooquery import Ticker
+# import math
+from typing import List
+def load_etf_data():
+    df_etf_info_master = pd.read_csv('etf_general_info_enriched.csv').rename(columns={'ticker': 'Ticker'})
+    df_etf, avilable_tickers = set_etf_data(df_etf_info_master)
+    df_analyst_report = pd.read_csv('etf_analyst_report_full.csv')
+    df_annual_return_master = pd.read_csv('annual_return.csv').rename(columns={'ticker': 'Ticker'})
+    return df_etf, df_analyst_report, avilable_tickers, df_annual_return_master
+def set_etf_data(df_src):
+    df = df_src[
+    (df_src['averageVolume'] > 1000) &
+    (df_src['exchangeCountry'] == 'United States')
+    ].dropna(subset=['categoryName'])
+    full_ticker_list = df['ticker'].unique().tolist()
+    valid_ticker_set = set(t.upper() for t in full_ticker_list)
+    return df, valid_ticker_set
+# Build a ticker → doc_text lookup
+def make_doc_text(row):
+    parts = []
+    # helper to append only if the value exists
+    def add(label, value):
+        if pd.notna(value) and str(value).strip():
+            parts.append(f"{label}: {value}" if label else str(value))
+    add(None,                     row.shortName)
+    add(None,                     row.longName)
+    add("Issuer",                 row.family)
+    add("Category",               row.categoryName)
+    add("Type",                   row.legalType)
+    add("Position",               row.positionType)
+    add("Tags",                   row.otherTags)
+    add("Return",                 row.return_rating_text)
+    add("Risk",                   row.risk_rating_text)
+    add("Expense Ratio",          row.annualReportExpenseRatio_rating_text)
+    add("Dividend Yield",         row.dividendYield_rating_text)
+    add(None,                     row.longBusinessSummary)
+    add("Holdings",               row.holdingInformation)
+    # join with “. ” so each bit reads like a sentence
+    return ". ".join(parts)
+# Helper: extract and filter ticker spans from tokens + labels
+def extract_valid_tickers(tokens, labels, tokenizer, valid_set):
+    spans, cur = [], []
+    for tok, lab in zip(tokens, labels):
+        if lab == "B-TICKER":
+            if cur:
+                spans.append(cur)
+            cur = [tok]
+        elif lab == "I-TICKER" and cur:
+            cur.append(tok)
+        else:
+            if cur:
+                spans.append(cur)
+            cur = []
+    if cur:
+        spans.append(cur)
+    results = []
+    for span in spans:
+        word = tokenizer.convert_tokens_to_string(span).strip().upper()
+        if word in valid_set:
+            results.append(word)
+    return results
+# Rule-based fallback: catch literal 2–4 char tickers in the text
+def rule_fallback(query, valid_set):
+    words = re.findall(r"\b[A-Za-z0-9]{2,4}\b", query)
+    return {w.upper() for w in words if w.upper() in valid_set}
+def get_cols_to_display() -> List[str]:
+    """
+    Returns the list of raw Neo4j ETF property names that we want to select
+    for our recommendations table.
+    """
+    return [
+        'ticker',
+        'annualReportExpenseRatio',
+        'previousCloseUSD',
+        'averageVolumeUSD',
+        'totalAssetsUSD',
+        'longName',
+        'marketCapUSD',
+        'dividendYield',
+        'ytdReturn',
+        'oneMonthReturn',
+        'threeMonthReturn',
+        'oneYearReturn',
+        'threeYearReturn',
+        'fiveYearReturn',
+        'tenYearReturn',
+        'avg_annual_return',
+        'return_rating',
+        'risk_rating',
+        'positionType',
+        'isLeveraged',
+        'return_rating_text',
+        'risk_rating_text',
+        'annualReportExpenseRatio_rating_text',
+        'dividendYield_rating_text',
+        'ytdReturn_rating_text',
+        'oneMonthReturn_rating_text',
+        'threeMonthReturn_rating_text',
+        'oneYearReturn_rating_text',
+        'threeYearReturn_rating_text',
+        'fiveYearReturn_rating_text',
+        'tenYearReturn_rating_text',
+    ]
+def rename_etf_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Rename DataFrame columns from raw names to display-friendly names.
+    """
+    mapping = {
+        'ticker':                             'Ticker',
+        'annualReportExpenseRatio':          'Expense Ratio',
+        'previousCloseUSD':                  'Prev. Close',
+        'averageVolumeUSD':                  'Avg. Volume',
+        'totalAssetsUSD':                    'Total Assets',
+        'longName':                          'Full Name',
+        'marketCapUSD':                      'Market Cap.',
+        'dividendYield':                     'Dividend Yield',
+        'ytdReturn':                         'YTD Return',
+        'oneMonthReturn':                    '1-month Return',
+        'threeMonthReturn':                  '3-month Return',
+        'oneYearReturn':                     '1-year Return',
+        'threeYearReturn':                   '3-year Return',
+        'fiveYearReturn':                    '5-year Return',
+        'tenYearReturn':                     '10-year Return',
+        'avg_annual_return':                 'Avg. Annual Return %',
+        'return_rating':                     'Avg. Return Rating (1-10)',
+        'risk_rating':                       'Avg. Risk Rating (1-10)',
+        'positionType':                      'Position Type',
+        'isLeveraged':                       'Leveraged',
+        'return_rating_text':                'Return Rating',
+        'risk_rating_text':                  'Risk Rating',
+        'annualReportExpenseRatio_rating_text': 'Expense Ratio Rating',
+        'dividendYield_rating_text':         'Dividend Yield Rating',
+        'ytdReturn_rating_text':             'YTD Return Rating',
+        'oneMonthReturn_rating_text':        '1-month Return Rating',
+        'threeMonthReturn_rating_text':      '3-month Return Rating',
+        'oneYearReturn_rating_text':         '1-year Return Rating',
+        'threeYearReturn_rating_text':       '3-year Return Rating',
+        'fiveYearReturn_rating_text':        '5-year Return Rating',
+        'tenYearReturn_rating_text':         '10-year Return Rating',
+    }
+    # Only rename columns that actually exist in df
+    valid_mapping = {k: v for k, v in mapping.items() if k in df.columns}
+    return df.rename(columns=valid_mapping)
+def get_etf_recommendations_from_list(
+    list_of_fetched_etfs: List[str],
+    df_etf: pd.DataFrame,
+    top_n: int
+) -> pd.DataFrame:
+    """
+    Filter the master ETF DataFrame down to the tickers you fetched,
+    sort by averageVolumeUSD descending, take the top_n rows,
+    select only the requested raw columns, rename them for display, and return.
+    Parameters
+    ----------
+    list_of_fetched_etfs : List[str]
+        ETF ticker symbols returned by your semantic search.
+    df_etf : pd.DataFrame
+        The full ETF DataFrame loaded from Neo4j, with raw property names.
+    top_n : int
+        How many of the highest-volume ETFs to return.
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame of the top_n ETFs (by avg volume), with only the
+        selected columns, renamed to friendly display names.
+    """
+    # 1. Keep only the tickers you fetched
+    df_filtered = df_etf[df_etf['ticker'].isin(list_of_fetched_etfs)].copy()
+    # 2. Sort by raw averageVolumeUSD descending
+    df_sorted = df_filtered.sort_values(by='averageVolumeUSD', ascending=False)
+    # 3. Take the top_n rows
+    df_top = df_sorted.head(top_n)
+    # 4. Select only the columns you asked for
+    df_selected = df_top[get_cols_to_display()]
+    # 5. Rename to friendly display names
+    df_final = rename_etf_columns(df_selected)
+    return df_final
+def format_number_short(x):
+    """
+    Converts a single number to a short format with K (thousands), M (millions),
+    B (billions), or T (trillions) suffix. Preserves NaN values.
+    Parameters:
+        x (float or int): The number to format.
+    Returns:
+        str or float: The formatted string if x is a number, or the original NaN.
+    """
+    # If the value is NaN, return it as is
+    if pd.isna(x):
+        return x
+    # Use the absolute value for comparison to handle negative numbers
+    abs_x = abs(x)
+    if abs_x < 1e3:
+        # For values less than 1,000, just return the value formatted to two decimals.
+        return f"{x:.2f}"
+    elif abs_x < 1e6:
+        # For thousands, divide by 1,000 and append 'K'
+        return f"{x/1e3:.2f}K"
+    elif abs_x < 1e9:
+        # For millions, divide by 1,000,000 and append 'M'
+        return f"{x/1e6:.2f}M"
+    elif abs_x < 1e12:
+        # For billions, divide by 1,000,000,000 and append 'B'
+        return f"{x/1e9:.2f}B"
+    else:
+        # For trillions and above, divide by 1,000,000,000,000 and append 'T'
+        return f"{x/1e12:.2f}T"
+def transform_number_columns(df, columns):
+    """
+    Transforms specified numeric columns in a DataFrame to short format strings.
+    The transformation converts numbers to their respective short formats:
+    thousands (K), millions (M), billions (B), and trillions (T).
+    NaN values are preserved.
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        columns (list): List of column names (as strings) to be transformed.
+    Returns:
+        pd.DataFrame: A copy of the DataFrame with the specified columns transformed.
+    """
+    # Create a copy of the DataFrame to avoid modifying the original
+    df_transformed = df.copy()
+    # Loop through each specified column
+    for col in columns:
+        if col in df_transformed.columns:
+            # Apply the formatting function to each value in the column.
+            df_transformed[col] = df_transformed[col].apply(format_number_short)
+    return df_transformed
+def transform_float_columns_to_perc(df, columns):
+    """
+    Transforms specified numeric columns in a DataFrame to short format strings.
+    The transformation converts numbers to their respective short formats:
+    thousands (K), millions (M), billions (B), and trillions (T).
+    NaN values are preserved.
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+        columns (list): List of column names (as strings) to be transformed.
+    Returns:
+        pd.DataFrame: A copy of the DataFrame with the specified columns transformed.
+    """
+    # Create a copy of the DataFrame to avoid modifying the original
+    df_transformed = df.copy()
+    # Loop through each specified column
+    for col in columns:
+        if col in df_transformed.columns:
+            # Apply transformation: multiply by 100, format as string, preserve NaNs
+            df_transformed[col] = df_transformed[col].apply(
+                lambda x: f"{x * 100:.2f}%" if pd.notna(x) else x
+            )
+    return df_transformed
+def overview_df(df_recommendations, drop_relavance_score=True):
+    overview_cols = ["Leveraged", "Ticker", "Full Name", 'Category', 'Country', 'Total Assets', "Prev. Close",
+                                        "Avg. Volume", 'Market Cap.', "Relevance Score"]
+    existing_cols = [col for col in overview_cols if col in df_recommendations.columns]
+    df_overview = transform_number_columns(df_recommendations[existing_cols], ['Total Assets', 'Market Cap.'])
+    df_overview = transform_float_columns_to_perc(df_overview, columns=['Relevance Score'])
+    if drop_relavance_score:
+        df_overview = df_overview.drop(['Relevance Score'], axis=1)
+    return df_overview
+def transform_return_columns(df, cols=None):
+    """
+    Transforms float values to percentage strings for all columns ending with 'Return'.
+    For each column in the DataFrame whose name ends with 'Return', the function
+    multiplies each non-NaN float value by 100 and formats it as a string with two
+    decimal places followed by a percent sign. NaN values are preserved.
+    Parameters:
+        df (pd.DataFrame): The input DataFrame.
+    Returns:
+        pd.DataFrame: A copy of the DataFrame with transformed 'Return' columns.
+    """
+    # Create a copy of the DataFrame to avoid modifying the original
+    df_transformed = df.copy()
+    # Loop through each column in the DataFrame
+    for col in df_transformed.columns:
+        # Check if the column name ends with 'Return'
+        if col.endswith('Return'):
+            # Apply transformation: multiply by 100, format as string, preserve NaNs
+            df_transformed[col] = df_transformed[col].apply(
+                lambda x: f"{x * 100:.2f}%" if pd.notna(x) else x
+            )
+    return df_transformed
+def return_df(df_recommendations):
+    # Returns
+    returns_cols = [
+                "Ticker", "Full Name", 'Category', "YTD Return", "1-month Return",
+                "3-month Return", "1-year Return", "3-year Return",
+                "5-year Return", "10-year Return"
+                ]
+    existing_cols = [col for col in returns_cols if col in df_recommendations.columns]
+    df_return = transform_return_columns(df_recommendations[existing_cols])
+    return df_return
+def clean_ratings_columns(df):
+    rating_cols = ['YTD Return Rating', '1-month Return Rating', '3-month Return Rating',
+                   '1-year Return Rating', '3-year Return Rating', '5-year Return Rating', '10-year Return Rating',
+                   'Expense Ratio Rating', 'Dividend Yield Rating']
+    strings_to_keep = ['High', 'Moderate', 'Low']
+    for col in rating_cols:
+        if col in df.columns:
+            df.loc[:, col] = df[col].copy().astype(str).apply(
+                lambda x: next((s for s in strings_to_keep if s in x), '').strip()
+            )
+    return df
+def rating_df(df_recommendations):
+    ratings_cols = [
+                    "Ticker", "Full Name", 'Category', "Avg. Return Rating (1-10)", "Avg. Risk Rating (1-10)",
+                    'Avg. Return Rating', 'Avg. Risk Rating', 'YTD Return Rating', '1-month Return Rating', '3-month Return Rating',
+                    '1-year Return Rating', '3-year Return Rating', '5-year Return Rating', '10-year Return Rating'
+                    ]
+    existing_cols = [col for col in ratings_cols if col in df_recommendations.columns]
+    df_rating = clean_ratings_columns(df_recommendations[existing_cols])
+    return df_rating
+def expense_ratio_df(df_recommendations):
+    expenses_cols = ["Ticker", "Full Name", "Category", 'Total Assets', 'Expense Ratio', 'Expense Ratio Rating']
+    existing_cols = [col for col in expenses_cols if col in df_recommendations.columns]
+    df_rec_transformed = transform_number_columns(df_recommendations[existing_cols], ['Total Assets'])
+    df_rec_transformed = transform_float_columns_to_perc(df_rec_transformed, columns=['Expense Ratio'])
+    df_rec_transformed = clean_ratings_columns(df_rec_transformed)
+    return df_rec_transformed
+def holdings_df(df_recommendations):
+    holdings_cols = ["Ticker", "Full Name", "Category", "Holdings"]
+    existing_cols = [col for col in holdings_cols if col in df_recommendations.columns]
+    return df_recommendations[existing_cols]
+def dividend_df(df_recommendations):
+    dividends_cols = ["Ticker", "Full Name", "Category", "Dividend Yield", "Dividend Yield Rating"]
+    existing_cols = [col for col in dividends_cols if col in df_recommendations.columns]
+    df_rec_transformed = clean_ratings_columns(df_recommendations[existing_cols])
+    df_rec_transformed = transform_float_columns_to_perc(df_rec_transformed, columns=['Dividend Yield'])
+    return df_rec_transformed
+def display_matching_etfs(df_recommendations):
+    if not df_recommendations.empty:
+        # st.write("Below are the **most recent ETF recommendations** we found:")
+        # Create tabs for each column group
+        tabs = st.tabs(["Overview", "Returns", "Ratings", 'Holdings', 'Expenses', 'Dividends'])
+        # Overview
+        with tabs[0]:
+            st.dataframe(overview_df(df_recommendations), hide_index=True)
+        # Returns
+        with tabs[1]:
+            st.dataframe(return_df(df_recommendations), hide_index=True)
+        # Ratings
+        with tabs[2]:
+            st.dataframe(rating_df(df_recommendations), hide_index=True)
+        # Holdings
+        with tabs[3]:
+            st.dataframe(holdings_df(df_recommendations), hide_index=True)
+        # Expenses
+        with tabs[4]:
+            st.dataframe(expense_ratio_df(df_recommendations), hide_index=True)
+        # Dividend
+        with tabs[5]:
+            st.dataframe(dividend_df(df_recommendations), hide_index=True)
+    return
+def compare_etfs_interactive(etf_a, etf_b):
+    """
+    Fetches 5-year historical price data for two ETFs from Yahoo Finance,
+    calculates percentage change from the starting price, and returns a Plotly
+    figure for interactive viewing in Streamlit.
+    Parameters:
+        etf_a (str): Ticker symbol for the first ETF.
+        etf_b (str): Ticker symbol for the second ETF.
+    Returns:
+        plotly.graph_objects.Figure: Interactive Plotly figure comparing the two ETFs.
+    """
+    end_date = datetime.today()
+    start_date = end_date - timedelta(days=5 * 365)
+    # Fetch historical data for both ETFs using yahooquery.
+    tickers = Ticker(f'{etf_a} {etf_b}', asynchronous=True)
+    df = tickers.history(period='5y', interval='1d').reset_index()
+    # Filter data for each symbol
+    df_a = df[df.symbol == etf_a].rename(columns={'adjclose': 'Adj Close A'})[['date', 'Adj Close A']]
+    df_b = df[df.symbol == etf_b].rename(columns={'adjclose': 'Adj Close B'})[['date', 'Adj Close B']]
+    # Merge on date
+    df_merged = pd.merge(df_a, df_b, on='date', how='inner').set_index('date')
+    # Calculate percentage change from the first available adjusted close
+    df_merged['Pct Change A'] = (df_merged['Adj Close A'] / df_merged['Adj Close A'].iloc[0] - 1) * 100
+    df_merged['Pct Change B'] = (df_merged['Adj Close B'] / df_merged['Adj Close B'].iloc[0] - 1) * 100
+    # Create a Plotly figure
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatter(
+            x=df_merged.index,
+            y=df_merged['Pct Change A'],
+            mode='lines',
+            name=etf_a
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=df_merged.index,
+            y=df_merged['Pct Change B'],
+            mode='lines',
+            name=etf_b
+        )
+    )
+    # Customize layout
+    fig.update_layout(
+        title=f'5-Year Performance Comparison: {etf_a} vs. {etf_b}',
+        xaxis_title='Date',
+        yaxis_title='Percentage Change (%)',
+        hovermode='x unified'
+    )
+    fig.update_xaxes(range=[start_date, end_date])
+    return fig
+def clean_etf_text(text: str) -> str:
+    """
+    Cleans and formats OCR-parsed ETF text by:
+    - Removing excessive newlines and spaces
+    - Fixing line-break hyphenations
+    - Normalizing whitespace and punctuation
+    - Removing wrapping quotes
+    """
+    # Remove leading/trailing whitespace and outer quotes if present
+    text = text.strip().strip('"').strip("'")
+    # Fix hyphenated line breaks (e.g., 'NASDAQ-\n100' -> 'NASDAQ-100')
+    text = re.sub(r'-\s*\n\s*', '-', text)
+    # Replace remaining line breaks with spaces
+    text = re.sub(r'[\n\r]+', ' ', text)
+    # Remove excessive spaces
+    text = re.sub(r'\s{2,}', ' ', text)
+    # Ensure proper spacing after periods, commas, etc.
+    text = re.sub(r'([.,!?])([^\s])', r'\1 \2', text)
+    # Capitalize the first letter if needed
+    if text and text[0].islower():
+        text = text[0].upper() + text[1:]
+    return text.strip()
+def trim_to_last_full_sentence(text: str) -> str:
+    # If it already ends cleanly, just return it
+    if text.rstrip().endswith(('.', '!', '?')):
+        return text
+    # Split on sentence boundaries: punctuation + space + uppercase
+    pattern = r'(?<=[.!?])\s+(?=[A-Z])'
+    parts = re.split(pattern, text)
+    # If we only got one part, nothing to trim
+    if len(parts) == 1:
+        return text
+    # Drop the last (incomplete) fragment and rejoin the rest
+    full_sentences = parts[:-1]
+    return ' '.join(full_sentences).strip()
+def lookup_etf_report(ticker, df_analyst_report):
+    return
+def form_display_comparison_table(df_etf, list_of_parsed_tickers):
+    cols_interests = ['Ticker', 'longName', 'categoryName', 'previousCloseUSD', 'averageVolumeUSD', 'return_rating', 'risk_rating',
+                            'ytdReturn', 'oneMonthReturn', 'threeMonthReturn', 'oneYearReturn', 'threeYearReturn', 'fiveYearReturn',
+                            'tenYearReturn', 'annualReportExpenseRatio']
+    cols_interests_pretty = ['Ticker', 'Full Name', 'Category', 'Prev. Close', 'Avg. Volume', 'Return Rating (1-10)', 'Risk Rating (1-10)',
+                                'YTD Return', '1-month Return', '3-month Return', '1-year Return', '3-year Return',
+                                '5-year Return', '10-year Return', 'Expense Ratio']
+    rename_dict = dict(zip(cols_interests, cols_interests_pretty))
+    df_comparison = df_etf[df_etf['Ticker'].isin(list_of_parsed_tickers)][cols_interests]
+    df_comparison = df_comparison.rename(columns=rename_dict)
+    df_comparison = transform_return_columns(df_comparison)
+    df_comparison = transform_float_columns_to_perc(df_comparison, columns=['Expense Ratio'])
+    df_comparison = transform_number_columns(df_comparison, ['Avg. Volume'])
+    return df_comparison
+def portfolio_interactive_chart(df_port_output):
+    # Create a Plotly figure
+    fig = go.Figure()
+    # Plot each ETF's growth as a separate line
+    for col in df_port_output.columns:
+        if col not in ["year", "Total"]:
+            fig.add_trace(go.Scatter(
+                x=df_port_output["year"],
+                y=df_port_output[col],
+                mode='lines',
+                name=col
+            ))
+    # Plot the 'Total' line, perhaps in a different style
+    fig.add_trace(go.Scatter(
+        x=df_port_output["year"],
+        y=df_port_output["Total"],
+        mode='lines',
+        name="Total",
+        # line=dict(dash='dash', color='black')
+        line=dict(dash='dash')
+    ))
+    fig.update_layout(
+        title="Portfolio Growth Over Time",
+        xaxis_title="Year",
+        yaxis_title="Portfolio Value (USD)",
+        hovermode='x unified'
+    )
+    return fig
+def set_estimated_return(tickers, df_general_info, df_annual_return):
+    """
+    Estimate the return for each ticker based on trailing and annual returns.
+    For each ticker, the function:
+      1. Extracts trailing return data from df_general_info for the columns:
+         'oneYearReturn', 'threeYearReturn', 'fiveYearReturn', and 'tenYearReturn'.
+      2. Replaces NaN values with 0 and calculates the mean of non-zero trailing returns.
+      3. Retrieves the average annual return from df_annual_return using the 'fundReturn' column.
+         If 'fundReturn' is NaN, it attempts to use the 'categoryReturn' column instead.
+      4. Uses the non-zero mean trailing return if available; otherwise, falls back to the annual return.
+    Parameters:
+        tickers (iterable): An iterable of ticker symbols to process.
+        df_general_info (pd.DataFrame): DataFrame containing general information including trailing returns.
+        df_annual_return (pd.DataFrame): DataFrame containing annual return information.
+    Returns:
+        dict: A dictionary mapping each ticker to its estimated return.
+    """
+    # Define the columns that contain the trailing returns in the general info DataFrame.
+    trailing_returns_cols = ['oneYearReturn', 'threeYearReturn', 'fiveYearReturn', 'tenYearReturn']
+    # Define the column names for annual return and category-based annual return.
+    annual_return_col = 'fundReturn'
+    cat_annual_return_col = 'categoryReturn'
+    # Dictionary to store the estimated return for each ticker.
+    # d_est_return = {}
+    ticker_collected = []
+    est_return_collected = []
+    # Loop over each ticker symbol provided in the tickers list.
+    for ticker in tickers:
+        # Extract the trailing return values for the current ticker.
+        trailing_return = df_general_info[df_general_info['Ticker'] == ticker][trailing_returns_cols].values
+        # Replace any NaN values in the trailing return array with 0.
+        trailing_return = np.nan_to_num(trailing_return, nan=0)
+        # Filter out zero values to only consider nonzero trailing returns.
+        non_zero_elements = trailing_return[trailing_return != 0]
+        # Calculate the mean of the nonzero trailing returns, if available.
+        if len(non_zero_elements) > 0:
+            non_zero_mean_trailing_return = np.mean(non_zero_elements)
         else:
+            non_zero_mean_trailing_return = 0
+        # Calculate the average annual return from the annual return DataFrame using 'fundReturn'.
+        avg_return = df_annual_return[df_annual_return['Ticker'] == ticker][annual_return_col].mean()
+        # If the annual return is NaN, try using the 'categoryReturn' column instead.
+        if pd.isnull(avg_return):
+            avg_return = df_annual_return[df_annual_return['Ticker'] == ticker][cat_annual_return_col].mean()
+            # If still NaN, default to 0.
+            if pd.isnull(avg_return):
+                avg_return = 0
+        # Choose the estimated return:
+        # If the nonzero trailing mean is 0, use the annual return (avg_return).
+        # Otherwise, use the nonzero trailing mean.
+        if non_zero_mean_trailing_return == 0:
+            est_return_collected.append(avg_return)
+            # d_est_return[ticker] = avg_return
+        else:
+            est_return_collected.append(non_zero_mean_trailing_return)
+            # d_est_return[ticker] = non_zero_mean_trailing_return
+        ticker_collected.append(ticker)
+    df = pd.DataFrame({'etf': ticker_collected, 'estimated_annual_return': est_return_collected})
+    d = df.to_dict()
+    return df, d
+def form_d_chat_history(result_id, response, task, fig=None, df=None):
+    d = {
+            "id": result_id,
+            "task": task,
+            "response": response,
+            "fig": fig,
+            "df": df
+        }
+    return d
+def portfolio_growth_over_time(df, target_years=30):
+    """
+    Calculate the portfolio value over time (yearly) for each asset in the DataFrame.
+    The DataFrame should have columns:
+        - 'etf'
+        - 'initial_investment'
+        - 'estimated_annual_return' (as percentage string like "10%" or as a decimal)
+        - 'amount_of_recurring_investments'
+    Parameters:
+        df (pd.DataFrame): Input DataFrame with asset details.
+        target_years (int): Total number of years to project (default is 30).
+    Returns:
+        portfolio_data (pd.DataFrame): DataFrame containing the portfolio value for each asset
+                                       and the total portfolio value over time.
+    """
+    years = np.arange(0, target_years + 1)  # yearly intervals from 0 to target_years
+    portfolio_data = pd.DataFrame({'year': years})
+    # Process each asset separately
+    for idx, row in df.iterrows():
+        etf = row['etf']
+        P = row['initial_investment']
+        recurring = row['amount_of_recurring_investments']
+        r = row['estimated_annual_return']
+        # Convert percentage string (if applicable) to a decimal
+        if isinstance(r, str) and '%' in r:
+            r = float(r.strip('%')) / 100.0
+        monthly_rate = r / 12
+        values = []
+        for t in years:
+            months = int(t * 12)
+            # Future value from the initial investment:
+            fv_initial = P * (1 + monthly_rate) ** months
+            # Future value from monthly contributions (annuity formula)
+            if monthly_rate != 0:
+                fv_contrib = recurring * (((1 + monthly_rate) ** months - 1) / monthly_rate)
+            else:
+                fv_contrib = recurring * months
+            total_value = fv_initial + fv_contrib
+            values.append(total_value)
+        portfolio_data[etf] = values
+    # Compute total portfolio value (summing each asset)
+    asset_columns = df['etf'].tolist()
+    portfolio_data['Total'] = portfolio_data[asset_columns].sum(axis=1)
+    last_row = portfolio_data.iloc[-1].to_dict()
+    return portfolio_data, last_row
+def run_portfolio_analysis(list_of_parsed_tickers, df_etf, df_annual_return_master):
+    # Portfolio Analysis configuration
+    target_years = 30
+    init_investment = 1000
+    recur_monthly = 100
+    df_port_input = pd.DataFrame({'etf': list_of_parsed_tickers,
+                                'initial_investment': [init_investment] * len(list_of_parsed_tickers),
+                                'amount_of_recurring_investments': [recur_monthly] * len(list_of_parsed_tickers)})
+    df_est_return, d_est_return = set_estimated_return(tickers=list_of_parsed_tickers,
+                                                                df_general_info=df_etf,
+                                                                df_annual_return=df_annual_return_master)
+    df_port_input = df_port_input.merge(df_est_return, how='left', on='etf').fillna(0)
+    df_port_output, d_summary = portfolio_growth_over_time(df=df_port_input, target_years=target_years)
+    return df_port_output