Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import zipfile | |
| import re | |
| from io import BytesIO | |
def detect_file_type(file_path):
    """Classify an uploaded file by its extension.

    The check is case-insensitive and requires a real ``.ext`` suffix, so
    ``"chat.ZIP"`` is recognised while a name such as ``"mytxt"`` is not.

    Args:
        file_path: File name or path (anything convertible with ``str``).

    Returns:
        str: ``"txt"`` or ``"zip"`` when the name ends with that extension,
        otherwise ``"unknown"``.
    """
    name = str(file_path).lower()
    for extension in ("txt", "zip"):
        if name.endswith("." + extension):
            return extension
    return "unknown"
# Timestamp layout expected between the square brackets of each chat line.
_TIMESTAMP_FORMAT = '%d/%m/%y, %H:%M:%S'

# Initial WhatsApp system messages (English and Spanish) that are discarded.
_SYSTEM_NOTICES = (
    "Your messages and calls are end-to-end encrypted",
    "Los mensajes y las llamadas están cifrados de extremo a extremo",
)

_URL_PATTERN = re.compile(r'https?:\/\/\S+')

# Applied, in this order, to the already lower-cased message text.
_CLEANUP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'(?<!\w)(:\s|\s:\s|\s:)',        # colons that do not follow a word character
        r'\[image omitted\]',
        r'\[sticker omitted\]',
        r'\[document omitted\]',
        r'<se editó este mensaje\.?>',    # Spanish "message edited" marker
        r'<this message was edited\.?>',  # English marker, with or without the period
    )
)


def _read_chat_text(file_path, file_type):
    """Return the decoded chat text from a zip archive, an in-memory upload, or a path."""
    if file_type == "zip":
        with zipfile.ZipFile(file_path, 'r') as archive:
            names = archive.namelist()
            # Exports may bundle media next to the chat; prefer the first
            # .txt member and fall back to the first entry as before.
            chat_name = names[0]
            for name in names:
                if name.lower().endswith(".txt"):
                    chat_name = name
                    break
            raw = archive.read(chat_name)
    elif hasattr(file_path, "getvalue"):
        raw = file_path.getvalue()
    else:
        with open(file_path, "rb") as handle:
            raw = handle.read()
    if isinstance(raw, str):
        return raw
    # utf-8-sig also strips a leading BOM, which would otherwise hide the
    # opening '[' of the first line.
    return raw.decode('utf-8-sig')


def _clean_text(text):
    """Strip URLs, lower-case the text and remove media/edit placeholders."""
    cleaned = _URL_PATTERN.sub('', text).lower()
    for pattern in _CLEANUP_PATTERNS:
        cleaned = pattern.sub('', cleaned)
    return cleaned


def preprocess_whatsapp_messages(file_path, file_type):
    """Turn a WhatsApp chat export into one row of text per day.

    Each line of the form ``[dd/mm/yy, HH:MM:SS] sender: message`` is split at
    the first ``]``. Lines without a ``]`` (for example continuation lines of
    a multi-line message) and lines whose timestamp does not match the
    expected format are dropped. The remaining texts are cleaned (URLs
    removed, lower-cased, placeholders removed) and joined with newlines per
    calendar day.

    Args:
        file_path: Zip archive (path or file-like object), an in-memory
            upload exposing ``getvalue()``, or a path to a text file.
        file_type (str): ``"zip"`` to read the chat out of an archive; any
            other value treats ``file_path`` as the chat text itself.

    Returns:
        pandas.DataFrame: Columns ``date`` (datetime) and ``text`` (str), one
        row per day. Empty, with the same columns, when no line could be
        parsed.
    """
    text_data = _read_chat_text(file_path, file_type)
    # Normalise Windows line endings so messages do not keep a trailing '\r'.
    lines = text_data.replace('\r\n', '\n').strip().split('\n')

    rows = []
    for line in lines:
        stamp, bracket, body = line.partition(']')
        if not bracket:
            continue
        if any(notice in body for notice in _SYSTEM_NOTICES):
            continue
        rows.append((stamp.strip('['), body))

    empty_result = pd.DataFrame({
        'date': pd.Series(dtype='datetime64[ns]'),
        'text': pd.Series(dtype='object'),
    })
    if not rows:
        return empty_result

    df = pd.DataFrame(rows, columns=['timestamp', 'text'])
    parsed = pd.to_datetime(df['timestamp'], format=_TIMESTAMP_FORMAT, errors='coerce')
    # Keep only the date part; unparseable timestamps become NaT and are dropped.
    df = df.assign(timestamp=parsed.dt.date).dropna(subset=['timestamp'])
    if df.empty:
        return empty_result

    df = df.assign(text=df['text'].map(_clean_text))

    # Group by date and concatenate all messages from the same date.
    daily = df.groupby('timestamp')['text'].agg('\n'.join).reset_index()
    daily.columns = ['date', 'text']
    daily['date'] = pd.to_datetime(daily['date'])
    daily['text'] = daily['text'].astype(str)
    return daily
def get_dated_input(data, selected_date):
    """Return the text stored for a single calendar day.

    Args:
        data: DataFrame with a datetime ``date`` column and a ``text`` column.
        selected_date: Anything ``pandas.to_datetime`` accepts; only its date
            part is compared, the time of day is ignored.

    Returns:
        The ``text`` value of the first row whose date matches.

    Raises:
        IndexError: If ``data`` has no row for ``selected_date``.
    """
    target_day = pd.to_datetime(selected_date).date()
    day_texts = data.loc[data['date'].dt.date == target_day, 'text']
    if day_texts.empty:
        raise IndexError(f"No messages found for {target_day}")
    return day_texts.iloc[0]