import html
import json
import os

import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup
from snowflake.snowpark import Session

from Messaging_system.Permes import Permes
from Messaging_system.SnowFlakeConnection import SnowFlakeConn
from Messaging_system.context_validator import Validator


# --------------------------------------------------------------
def load_config_(file_path):
    """
    Load a JSON configuration file from the local file system.

    :param file_path: local path to the JSON file
    :return: the parsed JSON content
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# --------------------------------------------------------------
def clean_html_tags(users_df):
    """
    Strip HTML entities and tags from every string cell of a DataFrame.

    Each column is overwritten in place with the result of applying
    ``clean_text`` to its cells; non-string cells are left untouched.

    :param users_df: pandas DataFrame to clean (modified in place)
    :return: the same DataFrame, after cleaning
    """
    for col in users_df.columns:
        # Apply the cleaning function to each cell in the column
        users_df[col] = users_df[col].apply(clean_text)
    return users_df


# --------------------------------------------------------------
def clean_text(text):
    """
    Return ``text`` with HTML entities unescaped and HTML markup removed.

    :param text: any cell value; only ``str`` values are processed
    :return: the plain text for strings, the unchanged value otherwise
    """
    if not isinstance(text, str):
        return text

    # Unescape HTML entities first so escaped markup is also parsed away.
    unescaped = html.unescape(text)
    # Parse HTML and keep only the text content.
    return BeautifulSoup(unescaped, "html.parser").get_text()


# =============================================================
def get_credential(key):
    """
    Look up a credential in the Streamlit secrets, then in the environment.

    :param key: name of the secret / environment variable
    :return: the Streamlit secret if it is set and truthy, otherwise the
             value of the environment variable (``None`` if neither exists)
    """
    try:
        secret = st.secrets.get(key)
    except FileNotFoundError:
        # Streamlit raises (a subclass of) FileNotFoundError when no secrets
        # file is available; fall through to the environment in that case.
        secret = None
    return secret or os.getenv(key)


# --------------------------------------------------------------
def _is_marked_valid(value):
    """Return True only when ``value`` is a true boolean or the text 'true'."""
    if isinstance(value, str):
        return value.strip().lower() == 'true'
    # Real booleans (Python or NumPy) appear when pandas has already parsed
    # the column; NaN, None and every other value count as not valid.
    return bool(value) if pd.api.types.is_bool(value) else False


def filter_validated_users(users):
    """
    Keep only the rows whose 'valid' column marks the user as valid.

    The 'valid' column may hold the strings 'True' / 'False' or real
    booleans. Any other value (including missing values) is treated as
    not valid. The input DataFrame is not modified.

    Parameters:
    users (DataFrame): A pandas DataFrame with a 'valid' column.

    Returns:
    DataFrame: A new DataFrame with only the valid rows, a boolean 'valid'
    column and a fresh 0..n-1 index.
    """
    # astype(bool) keeps the mask boolean even for an empty DataFrame.
    is_valid = users['valid'].map(_is_marked_valid).astype(bool)

    # assign() builds a new frame, so the caller's DataFrame stays untouched;
    # the positional mask avoids index alignment on duplicated indexes.
    filtered_users = users.assign(valid=is_valid).loc[is_valid.to_numpy()]

    return filtered_users.reset_index(drop=True)


# --------------------------------------------------------------
if __name__ == "__main__":
    # path to sample data
    # path = "Data/Singeo_Camp.csv"
    path = "Data/Test_users.csv"

    # loading sample data
    users = pd.read_csv(path)
    # users = clean_html_tags(users)

    config_file_path = 'Config_files/message_system_config.json'
    config_file = load_config_(config_file_path)

    openai_api_key = get_credential("OPENAI_API")

    conn = dict(
        user=get_credential("snowflake_user"),
        password=get_credential("snowflake_password"),
        account=get_credential("snowflake_account"),
        role=get_credential("snowflake_role"),
        database=get_credential("snowflake_database"),
        warehouse=get_credential("snowflake_warehouse"),
        schema=get_credential("snowflake_schema")
    )

    # --------------------
    # Do we need to validate user-generated context?
    # user_generated_context = True
    # input_validator = Validator(api_key=openai_api_key)
    # input_validator.set_validator_instructions()
    # users = input_validator.validate_dataframe(dataframe=users, target_column="forum_content")
    # users = filter_validated_users(users)
    # --------------------

    session = Session.builder.configs(conn).create()
    try:
        brand = "singeo"
        identifier_column = "email"

        snowflake = SnowFlakeConn(session=session, brand=brand)
        # users = snowflake.get_users_in_campaign(brand=brand)

        segment_info = """Student who haven't practiced for a few days"""

        # sample inputs
        CTA = """The goal is to tell them to practice singing"""
        # additional_instructions = """Include weeks_since _last_interaction in the message if you can create a better message to re-engage the user."""
        additional_instructions = None

        recsys_contents = ["workout", "course", "quick_tips"]

        # number_of_samples = users.shape[0]
        number_of_samples = 10

        # number of messages to generate
        number_of_messages = 3

        ex1 = """
        Header: Sing your heart out
        Message: Your next lesson is waiting. Get back to singing today!
        Header: Here Comes The Sun
        Message: A quick practice session will light up your day. Let’s get right back at it.
        Header: Ain’t No Mountain High Enough
        Message: Daily practice makes you unstoppable. Time to build your skills!
        """
        sample_example = ex1

        ex2 = """
        Header: Get Back On Track!
        Message: Join thousands of singers in reaching their goals. Take a lesson today!
        Header: It’s Been A While
        Message: We haven’t seen you in a bit. Slide back into your practice sessions now.
        Header: It Only Takes 10 Minutes
        Message: Build your momentum and get you back into the groove!
        Header: Let’s Sing!
        Message: Haven’t practiced yet? This will get you going.
        """

        ex3 = """
        Header: We Miss Your Singing!
        Message: You haven’t practiced for 25 days. It’s time to dive back in!
        Header: Lost In Your Singing Journey?
        Message: These lessons are curated just for you. Start singing today!
        """

        # Examples used for the follow-up messages, keyed by message number.
        subsequent_examples = {
            2: ex2,
            3: ex3
        }

        involve_recsys_result = True
        involve_last_interaction = False
        # messaging_mode = "recommend_playlist"
        platform = "push"

        selected_source_features = None
        selected_input_features = None
        segment_name = "no_recent_activity"

        permes = Permes()

        # o3-mini o1-mini o4-mini o1
        users_message = permes.create_personalize_messages(
            session=session,
            model="gpt-4.1-mini",
            users=users,
            brand=brand,
            config_file=config_file,
            openai_api_key=openai_api_key,
            segment_info=segment_info,
            number_of_samples=number_of_samples,
            number_of_messages=number_of_messages,
            subsequent_examples=subsequent_examples,
            platform=platform,
            involve_recsys_result=involve_recsys_result,
            identifier_column=identifier_column,
            recsys_contents=recsys_contents,
            sample_example=sample_example,
            segment_name=segment_name,
            personalization=True)

        users_message.to_csv("Singeo_camp.csv", encoding='utf-8-sig', index=False)
    finally:
        # Release the Snowflake session even when message generation fails.
        session.close()