Spaces:
Sleeping
Sleeping
import os
import re

import pandas as pd
def load_pdf_data(file_path, encoding="ISO-8859-1"):
    """Load the intents dataset from a CSV file.

    Despite the ``pdf`` in its name, this function parses CSV input via
    ``pandas.read_csv``.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        encoding: Text encoding used to decode the file. Defaults to
            ``"ISO-8859-1"``, the value that used to be hard-coded, so
            existing callers are unaffected.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path, encoding=encoding)
def clean_text(text):
    """Normalise a single utterance for intent modelling.

    The text is lower-cased, digits are removed, and every run of
    characters outside ``\\w`` (i.e. anything that is not a letter, digit
    or underscore) is collapsed to one space. Leading and trailing
    whitespace is stripped.

    Args:
        text: The raw utterance. ``None`` and float NaN (how missing cells
            show up when this is applied over a pandas column) yield an
            empty string; any other non-string value is converted with
            ``str`` first.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # NaN is the only float that is unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # Strip digits *before* collapsing separators: doing it afterwards
    # leaves a double space wherever a standalone number used to be
    # ("call 911 now" -> "call  now").
    text = re.sub(r'\d+', '', text)
    # \W keeps underscores, so "snake_case" survives unchanged.
    text = re.sub(r'\W+', ' ', text)
    return text.strip()
def preprocess_data(data):
    """Run ``clean_text`` over every value of the ``utterance`` column.

    The column is overwritten on the DataFrame that was passed in, and
    that same DataFrame object is returned.
    """
    cleaned_utterances = data['utterance'].apply(clean_text)
    data['utterance'] = cleaned_utterances
    return data
def save_filter_intents(data, file_path, samples_per_intent=15, random_state=None):
    """Write a down-sampled copy of ``data`` with a capped row count per intent.

    Rows are grouped by the ``intent`` column and at most
    ``samples_per_intent`` rows are drawn at random from each group;
    groups with fewer rows are kept whole. The result is written to
    ``Pager_filtered_Intents.csv`` inside ``file_path`` without the index.

    Args:
        data: DataFrame containing an ``intent`` column.
        file_path: Directory in which the CSV file is created.
        samples_per_intent: Maximum number of rows kept per intent.
            Defaults to 15, the value that used to be hard-coded.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for
            reproducible draws. ``None`` keeps the original unseeded
            behaviour.
    """
    # Sample each group explicitly instead of using groupby(...).apply(...):
    # apply operating on the grouping column is deprecated in recent pandas,
    # whereas iterating the groups always yields the full rows.
    sampled_groups = [
        group.sample(n=min(len(group), samples_per_intent), random_state=random_state)
        for _, group in data.groupby('intent')
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame that
    # still carries the input's columns.
    filtered_df = pd.concat(sampled_groups) if sampled_groups else data.iloc[0:0]

    # os.path.join replaces the old file_path + '\Pager_...' concatenation,
    # which relied on an invalid '\P' escape and only worked on Windows.
    filtered_df.to_csv(os.path.join(file_path, 'Pager_filtered_Intents.csv'), index=False)
def save_cleaned_data(data, file_path):
    """Write ``data`` to ``Pager_Intents_cleaned.csv`` inside ``file_path``.

    Args:
        data: DataFrame to save; the index is not written.
        file_path: Directory in which the CSV file is created.
    """
    # os.path.join replaces the old file_path + '\Pager_...' concatenation,
    # which relied on an invalid '\P' escape and only worked on Windows.
    data.to_csv(os.path.join(file_path, 'Pager_Intents_cleaned.csv'), index=False)
# data = load_pdf_data(r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents.csv')
# cleaned_data = preprocess_data(data)
# save_cleaned_data(cleaned_data, r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data')
# save_filter_intents(cleaned_data, r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data')
| # split train test | |
def split_train_test(data, test_size=0.3, random_state=42):
    """Randomly split ``data`` into a training part and a testing part.

    Args:
        data: DataFrame to split.
        test_size: Fraction of rows assigned to the test set; must lie
            between 0 and 1 inclusive.
        random_state: Seed for the shuffle. Defaults to 42, the value that
            used to be hard-coded, so existing callers get the same split.

    Returns:
        A ``(train_data, test_data)`` tuple. Training rows appear in
        sampled order; test rows keep their original relative order.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # Split by row *position* rather than by index label. The previous
    # data.drop(train_data.index) removed every row sharing a label with a
    # training row, so a frame with duplicate index labels lost test rows.
    positions = pd.Series(range(len(data)))
    train_positions = positions.sample(frac=1 - test_size, random_state=random_state).to_numpy()
    in_train = positions.isin(train_positions).to_numpy()

    train_data = data.iloc[train_positions]
    test_data = data.iloc[~in_train]

    print(f"Train data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")
    return train_data, test_data
def save_train_test_data(train_data, test_data, file_path):
    """Write the train and test splits as CSV files inside ``file_path``.

    Args:
        train_data: DataFrame saved as ``train_data.csv``.
        test_data: DataFrame saved as ``test_data.csv``.
        file_path: Directory in which both files are created.

    Neither file includes the DataFrame index.
    """
    # os.path.join instead of appending '\\name.csv', so the files land
    # inside file_path on every platform, not just Windows.
    train_data.to_csv(os.path.join(file_path, 'train_data.csv'), index=False)
    test_data.to_csv(os.path.join(file_path, 'test_data.csv'), index=False)
| # train_data, test_data = split_train_test(cleaned_data) | |
| # save_train_test_data(train_data, test_data, 'C:\\Users\\serban.tica\\Documents\\tobi_llm_intent_recognition\\data') | |