Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- ChatAssessmentAnalysis.py +58 -0
- EmbeddingExtraction.py +41 -0
- FeatureExtraction.py +37 -0
- ObtainChatData.py +25 -0
ChatAssessmentAnalysis.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ChatAssessmentAnalysis.py
# Purpose: Script for analyzing chat data using machine learning models,
# including training, validation, and testing of a multi-output regressor
# that predicts assessment scores from chat-transcript embeddings.

import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read your data file
datafile_path = "data/chat_transcripts_with_features.csv"  # Update this path as necessary
df = pd.read_csv(datafile_path)

# Convert embeddings back to numpy arrays: the CSV stores them as
# stringified lists like "[0.1, 0.2, ...]".
df['embedding'] = df['embedding'].apply(lambda x: np.array([float(num) for num in x.strip('[]').split(',')]))

# Define features (X) and labels (y) - Adjust column names as per your dataset
X = np.array(df['embedding'].tolist())
y = df[['score1', 'score2', 'score3']].values  # Replace with your actual score columns

# Split data into training (70%), validation (15%), and testing (15%) sets.
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Train the regression model.
# Note: You can replace XGBRegressor with any other regression model as per your requirement.
# For instance, you might use RandomForestRegressor or a neural network model from Keras.
# BUG FIX: the original passed `lam=0.5`, which is not a recognized XGBoost
# parameter (the sklearn wrapper spells L1/L2 regularization `reg_alpha` /
# `reg_lambda`); XGBoost only warns about unknown parameters, so the intended
# L2 regularization was silently never applied.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.05,
    max_depth=4,
    reg_alpha=0,
    reg_lambda=0.5,
    n_estimators=200,
)
# One independent XGBoost model is fitted per score column.
multioutput_reg = MultiOutputRegressor(xg_reg)
multioutput_reg.fit(X_train, y_train)

# Save the trained model
model_filename = 'trained_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(multioutput_reg, file)
print(f"Model trained and saved as {model_filename}")

# Validate the model
# Note: You can use other metrics for validation based on your specific needs.
# For instance, you might consider using precision, recall, F1-score, or ROC-AUC for classification tasks.
val_preds = multioutput_reg.predict(X_val)
val_mse = mean_squared_error(y_val, val_preds)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MSE: {val_mse:.2f}, Validation MAE: {val_mae:.2f}")

# Test the model
test_preds = multioutput_reg.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Test MSE: {test_mse:.2f}, Test MAE: {test_mae:.2f}")

# Note to Users:
# - Make sure to adjust the data paths and column names to match your dataset.
# - Feel free to experiment with different machine learning models and parameters to find the best fit for your data.
# - The trained model can be used to make predictions on new chat transcript data.
# - Consider re-training the model periodically with new data to keep it updated and improve its accuracy.
EmbeddingExtraction.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# EmbeddingExtraction.py
# Purpose: Compute OpenAI embeddings for preprocessed chat transcripts and
# write the enriched dataset back out as CSV.

import pandas as pd
import openai
import tiktoken
import os
import config
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(override=True)

# Set your OpenAI API key (read from the environment populated by .env).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for *text* from the given OpenAI model.

    Newlines are flattened to spaces before the API call.
    """
    cleaned = text.replace("\n", " ")
    response = client.embeddings.create(input=[cleaned], model=model)
    return response.data[0].embedding


# Load preprocessed chat transcript data
input_datapath = "../data/processed_chat_data.csv"
output_datapath = "../data/chat_transcripts_with_embeddings.csv"
df = pd.read_csv(input_datapath)

# Ensure your chat transcripts are within the token limit for embedding:
# count tokens with the model's tokenizer and drop over-long rows.
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df["transcript"].apply(lambda text: len(encoding.encode(text)))
df = df[df["n_tokens"] <= max_tokens]

# Extract embeddings for each chat transcript
print("Extracting embeddings...")
df["embedding"] = df["transcript"].apply(lambda text: get_embedding(text, embedding_model))

# Save the data with embeddings
df.to_csv(output_datapath, index=False)
print(f"Data with embeddings saved to {output_datapath}")
FeatureExtraction.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# FeatureExtraction.py
# Purpose: Script to extract additional features from chat transcripts for
# psychological assessments (currently TextBlob sentiment scores).

# Imports
import pandas as pd
import numpy as np
from textblob import TextBlob


def get_sentiment_polarity(text):
    """Return the sentiment polarity of *text* in [-1.0, 1.0] (negative → positive)."""
    return TextBlob(text).sentiment.polarity


def get_sentiment_subjectivity(text):
    """Return the sentiment subjectivity of *text* in [0.0, 1.0] (objective → subjective)."""
    return TextBlob(text).sentiment.subjectivity


# Load data with embeddings
input_datapath = "data/chat_transcripts_with_embeddings.csv"
output_datapath = "data/chat_transcripts_with_features.csv"
df = pd.read_csv(input_datapath)

# Feature Extraction
# Example: Extracting sentiment polarity and subjectivity.
# BUG FIX: the upstream pipeline (ObtainChatData.py / EmbeddingExtraction.py)
# produces a 'transcript' column; the original referenced a nonexistent
# 'chathistory' column, which raises KeyError at runtime.
df['sentiment_polarity'] = df['transcript'].apply(get_sentiment_polarity)
df['sentiment_subjectivity'] = df['transcript'].apply(get_sentiment_subjectivity)

# TODO: Add any additional feature extraction relevant to your study here.
# Example: df['feature_name'] = df['column'].apply(your_custom_function)

# Save the data with additional features
df.to_csv(output_datapath, index=False)
print(f"Data with additional features saved to {output_datapath}")

# Note to Users:
# - Ensure that 'input_datapath' points to your data file with embeddings.
# - This script uses TextBlob for sentiment analysis. Install it using 'pip install textblob' if not already installed.
# - You can add more feature extraction functions as needed for your specific research requirements.
ObtainChatData.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ObtainChatData.py
# Purpose: Load the raw chatbot export, drop incomplete rows, and merge each
# user's transcripts into a single row for downstream embedding extraction.
import pandas as pd

# Load & inspect dataset
input_datapath = "../data/chatbotdata.csv"
output_datapath = "../data/processed_chat_data.csv"
chats = pd.read_csv(input_datapath)
print("Initial Data Loaded. Here's a preview:")
print(chats.head(2))

# Data Preprocessing: discard any row with a missing value.
chats = chats.dropna()
print(f"Data after cleaning: {len(chats)} entries")

# Combine all transcripts from the same user into a single transcript
per_user = (
    chats.groupby('user_id')['transcript']
    .apply(lambda parts: ' '.join(parts))
    .reset_index()
)

# Save the processed data
per_user.to_csv(output_datapath, index=False)
print(f"Processed data saved to {output_datapath}")

# Note to Users:
# - Ensure that your data file path is correctly specified in 'input_datapath'.
# - The script currently includes a basic data cleaning step (dropping missing values). Depending on your data, you may need to add more preprocessing steps.
# - The processed data is saved in CSV format. Make sure to use this processed file in subsequent analysis scripts.