AjithKSenthil committed on
Commit
d2e169c
·
verified ·
1 Parent(s): 66cbf2b

Upload 5 files

Browse files
ChatAssessmentAnalysis.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ChatAssessmentAnalysis.py
# Purpose: Script for analyzing chat data using machine learning models, including training, validation, and testing.

import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read your data file
datafile_path = "data/chat_transcripts_with_features.csv"  # Update this path as necessary
df = pd.read_csv(datafile_path)

# Convert embeddings to numpy arrays.
# The CSV stores each embedding as a "[f1, f2, ...]" string; parse it back into floats.
df['embedding'] = df['embedding'].apply(lambda x: np.array([float(num) for num in x.strip('[]').split(',')]))

# Define features (X) and labels (y) - Adjust column names as per your dataset
X = np.array(df['embedding'].tolist())
y = df[['score1', 'score2', 'score3']].values  # Replace with your actual score columns

# Split data into training (70%), validation (15%), and testing (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train the regression model
# Note: You can replace XGBRegressor with any other regression model as per your requirement.
# For instance, you might use RandomForestRegressor or a neural network model from Keras.
# BUG FIX: the original passed `alpha=0, lam=0.5`. `lam` is not an XGBoost
# parameter name (the L2 regularization term is `reg_lambda`, alias `lambda`),
# so the intended L2 regularization was silently ignored. Use the explicit
# `reg_alpha` / `reg_lambda` names so both terms actually take effect.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=4,
    reg_alpha=0,
    reg_lambda=0.5,
    n_estimators=200,
)
# XGBRegressor predicts a single target; MultiOutputRegressor fits one copy per score column.
multioutput_reg = MultiOutputRegressor(xg_reg)
multioutput_reg.fit(X_train, y_train)

# Save the trained model
model_filename = 'trained_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(multioutput_reg, file)
print(f"Model trained and saved as {model_filename}")

# Validate the model
# Note: You can use other metrics for validation based on your specific needs.
# For instance, you might consider using precision, recall, F1-score, or ROC-AUC for classification tasks.
val_preds = multioutput_reg.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")

# Test the model
test_preds = multioutput_reg.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")

# Note to Users:
# - Make sure to adjust the data paths and column names to match your dataset.
# - Feel free to experiment with different machine learning models and parameters to find the best fit for your data.
# - The trained model can be used to make predictions on new chat transcript data.
# - Consider re-training the model periodically with new data to keep it updated and improve its accuracy.
EmbeddingExtraction.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# EmbeddingExtraction.py
# Purpose: Attach OpenAI embedding vectors to preprocessed chat transcripts.

import pandas as pd
import openai
import tiktoken
import os
import config
from openai import OpenAI
from dotenv import load_dotenv

# Pull environment variables from .env, overriding any already set.
load_dotenv(override=True)

# OpenAI client authenticated via the OPENAI_API_KEY environment variable.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

# Embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for *text* from the given OpenAI model.

    Newlines are flattened to spaces before the API call.
    """
    cleaned = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding


# Load preprocessed chat transcript data
input_datapath = "../data/processed_chat_data.csv"
output_datapath = "../data/chat_transcripts_with_embeddings.csv"
df = pd.read_csv(input_datapath)

# Drop transcripts that exceed the model's token limit: count tokens with the
# matching tiktoken encoding, then keep only rows within max_tokens.
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df["transcript"].apply(lambda t: len(encoding.encode(t)))
df = df[df["n_tokens"] <= max_tokens]

# Extract embeddings for each chat transcript
print("Extracting embeddings...")
df["embedding"] = df["transcript"].apply(lambda t: get_embedding(t, embedding_model))

# Save the data with embeddings
df.to_csv(output_datapath, index=False)
print(f"Data with embeddings saved to {output_datapath}")
FeatureExtraction.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# FeatureExtraction.py
# Purpose: Script to extract additional features from chat transcripts for psychological assessments.

# Imports
import pandas as pd
import numpy as np
from textblob import TextBlob


def get_sentiment_polarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


def get_sentiment_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity


# Load data with embeddings
input_datapath = "data/chat_transcripts_with_embeddings.csv"
output_datapath = "data/chat_transcripts_with_features.csv"
df = pd.read_csv(input_datapath)

# Feature Extraction
# Derive sentiment polarity and subjectivity from each chat history.
df['sentiment_polarity'] = df['chathistory'].apply(get_sentiment_polarity)
df['sentiment_subjectivity'] = df['chathistory'].apply(get_sentiment_subjectivity)

# TODO: Add any additional feature extraction relevant to your study here.
# Example: df['feature_name'] = df['column'].apply(your_custom_function)

# Save the data with additional features
df.to_csv(output_datapath, index=False)
print(f"Data with additional features saved to {output_datapath}")

# Note to Users:
# - Ensure that 'input_datapath' points to your data file with embeddings.
# - This script uses TextBlob for sentiment analysis. Install it using 'pip install textblob' if not already installed.
# - You can add more feature extraction functions as needed for your specific research requirements.
ObtainChatData.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ObtainChatData.py
# Purpose: Clean raw chatbot data and merge each user's transcripts into one row.
import pandas as pd

# Load & inspect dataset
input_datapath = "../data/chatbotdata.csv"
output_datapath = "../data/processed_chat_data.csv"
raw_df = pd.read_csv(input_datapath)
print("Initial Data Loaded. Here's a preview:")
print(raw_df.head(2))

# Data Preprocessing: drop every row containing a missing value.
raw_df = raw_df.dropna()
print(f"Data after cleaning: {len(raw_df)} entries")

# Combine all transcripts from the same user into a single transcript
per_user = raw_df.groupby('user_id')['transcript']
combined = per_user.apply(' '.join).reset_index()

# Save the processed data
combined.to_csv(output_datapath, index=False)
print(f"Processed data saved to {output_datapath}")

# Note to Users:
# - Ensure that your data file path is correctly specified in 'input_datapath'.
# - The script currently includes a basic data cleaning step (dropping missing values). Depending on your data, you may need to add more preprocessing steps.
# - The processed data is saved in CSV format. Make sure to use this processed file in subsequent analysis scripts.