Spaces:

ieuniversity
/

Whatsapp_Analysis_Tool

Runtime error

App Files Files Community

hannahisrael03 committed on Apr 15, 2024

Commit

bd9abb2

verified ·

1 Parent(s): 94b6be6

Upload 4 files

Browse files

Files changed (4) hide show

app.py +76 -0
model_functions.py +102 -0
preprocessor.py +94 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from model_functions import *
+from preprocessor import *
+import streamlit as st
+import pandas as pd
+def main():
+    """Render the Streamlit page: upload a chat export, pick a day, analyse it.
+
+    The sentiment, summary and NER steps run only after the 'Process' button
+    has been pressed, and every model is loaded exactly once per run.
+    """
+    st.title("WhatsApp Analysis Tool")
+    st.markdown("This app summarizes Whatsapp chats and provides named entity recognition as well as sentiment analysis for the conversation")
+    st.markdown("**NOTE**: *This app can only receive chats downloaded from IOS as the downloaded chat format is different than from Android.*")
+    st.markdown("Download your whatsapp chat by going to Settings > Chats > Export Chat and there select the chat you want to summarize (download 'Without Media').")
+    # File uploader (restricted to .zip exports)
+    uploaded_file = st.file_uploader("Choose a file (.zip)", type=['zip'])
+    if uploaded_file is None:
+        st.info("Please upload a file to proceed.")
+        return
+    file_type = detect_file_type(uploaded_file.name)
+    if file_type != "zip":
+        # The uploader only accepts .zip, so the message must not advertise .txt.
+        st.error("Unsupported file type. Please upload a .zip file.")
+        return
+    data = preprocess_whatsapp_messages(uploaded_file, file_type)
+    if data.empty:
+        st.write("No messages found or the file could not be processed.")
+        return
+    # Date selector: one entry per day present in the chat
+    date_options = data['date'].dt.strftime('%Y-%m-%d').unique()
+    selected_date = st.selectbox("Select a date for analysis:", date_options)
+    if not selected_date:
+        return
+    text_for_analysis = get_dated_input(data, selected_date)
+    with st.expander("Show/Hide Original Conversation"):
+        # Chat content is untrusted user text: show it as plain preformatted
+        # text instead of Markdown with raw HTML enabled.
+        st.text(text_for_analysis)
+    # Nothing below runs until the user explicitly asks for the analysis.
+    if not st.button('Process'):
+        return
+    # NOTE(review): the loaders run again on every 'Process' click; caching
+    # them (e.g. with st.cache_resource) would avoid that — confirm before adopting.
+    tokenizer_sentiment, model_sentiment = load_sentiment_analyzer()
+    tokenizer_summary, model_summary = load_summarizer()
+    pipe_ner = load_NER()
+    # Perform analysis
+    sentiment = get_sentiment_analysis(text_for_analysis, tokenizer_sentiment, model_sentiment)
+    summary = generate_summary(text_for_analysis, tokenizer_summary, model_summary)
+    ner_results = get_NER(text_for_analysis, pipe_ner)
+    # Display results
+    st.subheader("Sentiment Analysis")
+    st.write("Sentiment:", sentiment)
+    st.subheader("Summary")
+    st.write("Summary:", summary)
+    st.subheader("Named Entity Recognition")
+    ner_df = pd.DataFrame(ner_results, columns=["Word", "Entity Group"])
+    st.write(ner_df)
+if __name__ == "__main__":
+    main()

model_functions.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import torch
+from transformers import (AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
+                          AutoConfig, AutoModelForTokenClassification,
+                          AutoTokenizer, pipeline)
+from peft import PeftModel, PeftConfig
+def load_sentiment_analyzer():
+    """Load the sentiment tokenizer and sequence-classification model.
+
+    Returns:
+        tuple: ``(tokenizer, model)`` built from the same checkpoint id.
+    """
+    checkpoint = "aliciiavs/sentiment-analysis-whatsapp2"
+    return (
+        AutoTokenizer.from_pretrained(checkpoint),
+        AutoModelForSequenceClassification.from_pretrained(checkpoint),
+    )
+def load_summarizer():
+    """Load the BART base model, apply the PEFT adapter and merge it in.
+
+    Returns:
+        tuple: ``(tokenizer, model)`` where ``model`` is the result of
+        ``merge_and_unload()`` on the adapter-wrapped base model.
+    """
+    adapter_id = "marcelomoreno26/bart-large-samsum-adapter"
+    base_id = "facebook/bart-large"
+    adapter_config = PeftConfig.from_pretrained(adapter_id)
+    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_id)
+    tokenizer = AutoTokenizer.from_pretrained(base_id)
+    # The tokenizer pads with its end-of-sequence token.
+    tokenizer.pad_token = tokenizer.eos_token
+    adapted = PeftModel.from_pretrained(base_model, adapter_id, config=adapter_config)
+    return tokenizer, adapted.merge_and_unload()
+def load_NER():
+    """Build the token-classification ("ner") pipeline.
+
+    Returns:
+        The ``transformers`` pipeline created with
+        ``aggregation_strategy="average"``.
+    """
+    checkpoint = "hannahisrael03/distilbert-base-uncased-finetuned-wikiann"
+    ner_config = AutoConfig.from_pretrained(checkpoint)
+    return pipeline(
+        "ner",
+        model=AutoModelForTokenClassification.from_pretrained(checkpoint,config=ner_config),
+        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
+        aggregation_strategy="average",
+    )
+def get_sentiment_analysis(text, tokenizer, model, max_length=512):
+    """Classify ``text`` into one of six emotion labels.
+
+    Args:
+        text: Conversation text to classify.
+        tokenizer: Callable tokenizer returning model inputs as tensors.
+        model: Classifier whose output exposes ``logits``.
+        max_length: Token budget; longer inputs are truncated so that a long
+            day of messages cannot overflow the encoder.
+    Returns:
+        str: One of 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'.
+    """
+    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # argmax over the raw logits picks the same class as argmax over softmax.
+    predicted_label = torch.argmax(outputs.logits, dim=1).item()
+    # Map predicted label index to sentiment label
+    label_dic = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
+    return label_dic[predicted_label]
+def generate_summary(text, tokenizer, model, max_length=512, overlap=50, summary_max_length=150):
+    """Summarize ``text``, windowing inputs longer than ``max_length`` tokens.
+
+    Args:
+        text: Conversation text to summarize.
+        tokenizer: Tokenizer providing ``encode_plus`` and ``decode``.
+        model: Seq2seq model providing ``generate``.
+        max_length: Maximum number of input tokens per window.
+        overlap: Tokens shared between consecutive windows to carry context.
+        summary_max_length: ``max_length`` passed to ``model.generate``.
+    Returns:
+        str: The summary; window summaries are joined with single spaces.
+    Raises:
+        ValueError: If ``overlap`` is not smaller than ``max_length``.
+    """
+    step = max_length - overlap
+    if step <= 0:
+        raise ValueError("overlap must be smaller than max_length")
+    prefix = "summarize: "
+    encoded_input = tokenizer.encode_plus(prefix + text, return_tensors='pt', add_special_tokens=True)
+    input_ids = encoded_input['input_ids']
+    total_tokens = input_ids.shape[1]
+    summaries = []
+    # max(..., 1) keeps a single pass for an empty encoding, as a short input gets.
+    for start in range(0, max(total_tokens, 1), step):
+        segment_ids = input_ids[:, start:start + max_length]
+        output_ids = model.generate(segment_ids, max_length=summary_max_length, num_beams=5, early_stopping=True)
+        summaries.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
+        # Once a window reaches the end, any further window would only
+        # re-summarize the overlap, so stop here.
+        if start + max_length >= total_tokens:
+            break
+    return ' '.join(summaries)
+def get_NER(text, pipe):
+    """Run the NER pipeline and return de-duplicated ``[word, entity_group]`` rows.
+
+    For every (entity group, word) pair only the highest-scoring hit is kept.
+    Rows are ordered by the ``start`` offset of the kept hit, and ORG entities
+    made of more than two whitespace-separated words are dropped.
+    """
+    best_by_key = {}
+    for hit in pipe(text):
+        key = (hit['entity_group'], hit['word'])
+        current = best_by_key.get(key)
+        if current is None or current['score'] < hit['score']:
+            best_by_key[key] = hit
+    ordered_hits = sorted(best_by_key.values(), key=lambda hit: hit['start'])
+    return [
+        [hit['word'], hit['entity_group']]
+        for hit in ordered_hits
+        if hit['entity_group'] != 'ORG' or len(hit['word'].split()) <= 2
+    ]

preprocessor.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import pandas as pd
+import zipfile
+import re
+from io import BytesIO
+def detect_file_type(file_path):
+    """Return 'txt' or 'zip' based on the file extension, else 'unknown'.
+
+    The check is case-insensitive and requires a real ``.ext`` suffix, so
+    'chat.ZIP' is accepted while a bare name such as 'myzip' is not.
+    """
+    _, dot, extension = file_path.rpartition('.')
+    extension = extension.lower()
+    if dot and extension in ("txt", "zip"):
+        return extension
+    return "unknown"
+# Placeholders WhatsApp inserts for media and edited messages (matched after
+# lowercasing); the trailing period of the "edited" markers is optional.
+_PLACEHOLDER_PATTERNS = (
+    r'\[image omitted\]',
+    r'\[sticker omitted\]',
+    r'\[document omitted\]',
+    r'<se editó este mensaje\.?>',
+    r'<this message was edited\.?>',
+)
+def _empty_chat_frame():
+    """Return an empty frame with the same columns as a processed chat."""
+    return pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]'), 'text': pd.Series(dtype=object)})
+def _clean_message_text(text):
+    """Strip URLs, lowercase, drop stray colons and WhatsApp placeholders."""
+    text = re.sub(r'https?:\/\/\S+', '', text).lower()
+    # Remove colons that are not attached to a word (keeps "sender:").
+    text = re.sub(r'(?<!\w)(:\s|\s:\s|\s:)', '', text)
+    for pattern in _PLACEHOLDER_PATTERNS:
+        text = re.sub(pattern, '', text)
+    return text
+def preprocess_whatsapp_messages(file_path, file_type):
+    """Turn a WhatsApp chat export into one row of text per day.
+
+    Args:
+        file_path: Path or binary file-like object of the export. For
+            ``file_type == "zip"`` it is opened with ``zipfile.ZipFile``;
+            otherwise it must provide ``getvalue()`` returning the raw bytes.
+        file_type (str): "zip" or anything else for a plain text export.
+    Returns:
+        pandas.DataFrame: Columns ``date`` (datetime64) and ``text`` (all
+        cleaned messages of that day joined by newlines). Empty when the
+        archive is unreadable, holds no .txt member, or no line parses.
+    """
+    if file_type == "zip":
+        try:
+            with zipfile.ZipFile(file_path, 'r') as archive:
+                # Pick the chat transcript rather than whatever member comes first.
+                members = [name for name in archive.namelist() if name.lower().endswith('.txt')]
+                if not members:
+                    return _empty_chat_frame()
+                raw_bytes = archive.read(members[0])
+        except zipfile.BadZipFile:
+            return _empty_chat_frame()
+    else:
+        raw_bytes = file_path.getvalue()
+    # utf-8-sig also drops a leading byte-order mark if one is present.
+    text_data = raw_bytes.decode('utf-8-sig')
+    # splitlines() copes with both \n and \r\n endings.
+    lines = text_data.strip().splitlines()
+    if not lines:
+        return _empty_chat_frame()
+    # "[timestamp] text": partition always yields three columns, so lines
+    # without ']' can be filtered instead of breaking the column assignment.
+    parts = pd.Series(lines, dtype=object).str.partition(']')
+    parts.columns = ['timestamp', 'sep', 'text']
+    parts = parts[parts['sep'] == ']']
+    timestamps = pd.to_datetime(parts['timestamp'].str.strip('['), format='%d/%m/%y, %H:%M:%S', errors='coerce')
+    # Rows whose timestamp failed to parse (NaT) are dropped.
+    valid = timestamps.notna()
+    df = pd.DataFrame({'timestamp': timestamps[valid].dt.date, 'text': parts.loc[valid, 'text']})
+    # Remove initial WhatsApp system messages in English and Spanish
+    filter_text_en = "Your messages and calls are end-to-end encrypted"
+    filter_text_es = "Los mensajes y las llamadas están cifrados de extremo a extremo"
+    for banner in (filter_text_en, filter_text_es):
+        df = df[~df['text'].str.contains(banner, regex=False, na=False)]
+    if df.empty:
+        return _empty_chat_frame()
+    df = df.assign(text=df['text'].map(_clean_message_text))
+    # Group by date and concatenate all messages from the same date
+    grouped = df.groupby('timestamp')['text'].agg('\n'.join).reset_index()
+    grouped.columns = ['date', 'text']
+    grouped['date'] = pd.to_datetime(grouped['date'])
+    grouped['text'] = grouped['text'].astype(str)
+    return grouped
+def get_dated_input(data, selected_date):
+    '''
+    Return the conversation text stored for one calendar day.
+    :param data: DataFrame with a datetime ``date`` column and a ``text`` column.
+    :param selected_date: Day to look up; anything ``pd.to_datetime`` accepts.
+    :return: Text of the first row matching that day, or '' when none matches.
+    '''
+    wanted_day = pd.to_datetime(selected_date).date()
+    matching_text = data.loc[data['date'].dt.date == wanted_day, 'text']
+    if matching_text.empty:
+        return ''
+    return matching_text.iloc[0]

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch==2.2.2
+pandas==2.2.2
+transformers==4.39.3
+streamlit==1.33.0
+# NOTE(review): peft is installed from the repository's default branch with no tag or commit pinned, unlike the packages above — consider pinning a release.
+git+https://github.com/huggingface/peft.git