Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- ChatAssessmentAnalysis.py +58 -0
- EmbeddingExtraction.py +41 -0
- FeatureExtraction.py +37 -0
- ObtainChatData.py +25 -0
ChatAssessmentAnalysis.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ChatAssessmentAnalysis.py
# Purpose: Script for analyzing chat data using machine learning models,
# including training, validation, and testing of a multi-output regressor
# that predicts assessment scores from chat-transcript embeddings.

import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read your data file
datafile_path = "data/chat_transcripts_with_features.csv"  # Update this path as necessary
df = pd.read_csv(datafile_path)

# Convert embeddings back to numpy arrays: the CSV stores them as
# stringified lists like "[0.1, 0.2, ...]".
df['embedding'] = df['embedding'].apply(lambda x: np.array([float(num) for num in x.strip('[]').split(',')]))

# Define features (X) and labels (y) - Adjust column names as per your dataset
X = np.array(df['embedding'].tolist())
y = df[['score1', 'score2', 'score3']].values  # Replace with your actual score columns

# Split data into training (70%), validation (15%), and testing (15%) sets.
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train the regression model.
# Note: You can replace XGBRegressor with any other regression model as per your requirement.
# For instance, you might use RandomForestRegressor or a neural network model from Keras.
# BUG FIX: the original passed `lam=0.5`, which is not a recognized XGBoost
# parameter (the sklearn wrapper spells L1/L2 regularization `reg_alpha` /
# `reg_lambda`); XGBoost only warns about unknown parameters, so the intended
# L2 regularization was silently never applied.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=4,
    reg_alpha=0,
    reg_lambda=0.5,
    n_estimators=200,
)
# One independent XGBoost model is fitted per score column.
multioutput_reg = MultiOutputRegressor(xg_reg)
multioutput_reg.fit(X_train, y_train)

# Save the trained model
model_filename = 'trained_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(multioutput_reg, file)
print(f"Model trained and saved as {model_filename}")

# Validate the model
# Note: You can use other metrics for validation based on your specific needs.
# For instance, you might consider using precision, recall, F1-score, or ROC-AUC for classification tasks.
val_preds = multioutput_reg.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")

# Test the model
test_preds = multioutput_reg.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")

# Note to Users:
# - Make sure to adjust the data paths and column names to match your dataset.
# - Feel free to experiment with different machine learning models and parameters to find the best fit for your data.
# - The trained model can be used to make predictions on new chat transcript data.
# - Consider re-training the model periodically with new data to keep it updated and improve its accuracy.
EmbeddingExtraction.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# EmbeddingExtraction.py
# Purpose: Compute OpenAI embeddings for preprocessed chat transcripts and
# write the enriched dataset back out as CSV.

import pandas as pd
import openai
import tiktoken
import os
import config
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(override=True)

# Set your OpenAI API key (read from the environment populated by .env).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for *text* from the given OpenAI model.

    Newlines are flattened to spaces before the API call.
    """
    cleaned = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding


# Load preprocessed chat transcript data
input_datapath = "../data/processed_chat_data.csv"
output_datapath = "../data/chat_transcripts_with_embeddings.csv"
df = pd.read_csv(input_datapath)

# Ensure your chat transcripts are within the token limit for embedding:
# count tokens with the model's tokenizer and drop over-long rows.
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df["transcript"].apply(lambda text: len(encoding.encode(text)))
df = df[df["n_tokens"] <= max_tokens]

# Extract embeddings for each chat transcript
print("Extracting embeddings...")
df["embedding"] = df["transcript"].apply(lambda text: get_embedding(text, embedding_model))

# Save the data with embeddings
df.to_csv(output_datapath, index=False)
print(f"Data with embeddings saved to {output_datapath}")
FeatureExtraction.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FeatureExtraction.py
# Purpose: Script to extract additional features from chat transcripts for
# psychological assessments (currently TextBlob sentiment scores).

# Imports
import pandas as pd
import numpy as np
from textblob import TextBlob


def get_sentiment_polarity(text):
    """Return the sentiment polarity of *text* in [-1.0, 1.0] (negative → positive)."""
    return TextBlob(text).sentiment.polarity


def get_sentiment_subjectivity(text):
    """Return the sentiment subjectivity of *text* in [0.0, 1.0] (objective → subjective)."""
    return TextBlob(text).sentiment.subjectivity


# Load data with embeddings
input_datapath = "data/chat_transcripts_with_embeddings.csv"
output_datapath = "data/chat_transcripts_with_features.csv"
df = pd.read_csv(input_datapath)

# Feature Extraction
# Example: Extracting sentiment polarity and subjectivity.
# BUG FIX: the upstream pipeline (ObtainChatData.py / EmbeddingExtraction.py)
# produces a 'transcript' column; the original referenced a nonexistent
# 'chathistory' column, which raises KeyError at runtime.
df['sentiment_polarity'] = df['transcript'].apply(get_sentiment_polarity)
df['sentiment_subjectivity'] = df['transcript'].apply(get_sentiment_subjectivity)

# TODO: Add any additional feature extraction relevant to your study here.
# Example: df['feature_name'] = df['column'].apply(your_custom_function)

# Save the data with additional features
df.to_csv(output_datapath, index=False)
print(f"Data with additional features saved to {output_datapath}")

# Note to Users:
# - Ensure that 'input_datapath' points to your data file with embeddings.
# - This script uses TextBlob for sentiment analysis. Install it using 'pip install textblob' if not already installed.
# - You can add more feature extraction functions as needed for your specific research requirements.
ObtainChatData.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ObtainChatData.py
# Purpose: Load the raw chatbot export, drop incomplete rows, and merge each
# user's transcripts into a single row for downstream embedding extraction.
import pandas as pd

# Load & inspect dataset
input_datapath = "../data/chatbotdata.csv"
output_datapath = "../data/processed_chat_data.csv"
chats = pd.read_csv(input_datapath)
print("Initial Data Loaded. Here's a preview:")
print(chats.head(2))

# Data Preprocessing: discard any row with a missing value.
chats = chats.dropna()
print(f"Data after cleaning: {len(chats)} entries")

# Combine all transcripts from the same user into a single transcript
per_user = (
    chats.groupby('user_id')['transcript']
    .apply(lambda parts: ' '.join(parts))
    .reset_index()
)

# Save the processed data
per_user.to_csv(output_datapath, index=False)
print(f"Processed data saved to {output_datapath}")

# Note to Users:
# - Ensure that your data file path is correctly specified in 'input_datapath'.
# - The script currently includes a basic data cleaning step (dropping missing values). Depending on your data, you may need to add more preprocessing steps.
# - The processed data is saved in CSV format. Make sure to use this processed file in subsequent analysis scripts.