Spaces demo: Enron email question-answering (Colab notebook export; the hosted Space itself reported build errors).
| # https://www.kaggle.com/datasets/wcukierski/enron-email-dataset | |
| from google.colab import drive | |
| drive.mount('/content/drive') | |
| # libraries | |
| #!pip install transformers --upgrade | |
| #!pip install gradio | |
| #!pip install datasets | |
| #!pip install huggingface-hub | |
| #!pip install chromadb | |
| #!pip install accelerate==0.21.0 | |
| #!pip install transformers[torch] | |
| #!pip install git+https://github.com/huggingface/accelerate.git | |
| import pandas as pd | |
| import numpy as np | |
| from transformers import AutoModel | |
| from sklearn.model_selection import train_test_split | |
| from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline | |
| import gradio as gr | |
| import chromadb | |
| from datasets import Dataset | |
| from transformers import Trainer, TrainingArguments | |
| from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling | |
| from transformers import TextDataset, DataCollatorForLanguageModeling | |
| #from transformers import TrainingArguments, Trainer | |
| #from transformers import pipeline | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
# ---------------------------------------------------------------------------
# Load the Enron email dump and carve out a tiny working sample.
# ---------------------------------------------------------------------------
file_path = '/content/drive/MyDrive/emails.csv'
df = pd.read_csv(file_path)
df_columns = df.columns
print(df.head(10))

# Only the raw message text is needed downstream.
messages_df = df['message']
print(messages_df.head())
print(type(messages_df))

# Keep a minuscule slice as the working set — the fraction was tuned down
# repeatedly to stop Colab from crashing (a handful of emails out of ~500k).
emails_train, emails_test = train_test_split(messages_df, test_size=0.000008, random_state=42)
print(emails_test)
print(type(emails_test))

pd.set_option('display.max_colwidth', None)  # show full message bodies when printing
print(emails_test.head())  # first 5 rows
print(type(emails_test))
# ---------------------------------------------------------------------------
# Embeddings: encode each sampled email with vanilla BERT.
# ---------------------------------------------------------------------------
import os

# NOTE(review): this truncates by *characters*, not tokens — the tokenizer is
# what actually enforces the 512-token model limit; confirm this is intended.
max_seq_length = 512
truncated_emails_test = [email[:max_seq_length] for email in emails_test]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
embeddings_pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
embeddings = embeddings_pipeline(truncated_emails_test)
print(type(embeddings))

# Round-trip each embedding through a .npy file so it can be inspected.
for i, emb in enumerate(embeddings):
    np.save(f"embedding_{i}.npy", emb)

loaded_embeddings = []
for i in range(len(embeddings)):
    loaded_embeddings.append(np.load(f"embedding_{i}.npy"))

for i, emb in enumerate(loaded_embeddings):
    print(f"Embedding {i}:")
    print(emb)
import chromadb

# Spin up an in-memory ChromaDB client and a collection for the sample.
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="michelletest")

# Each pipeline output is nested; [0][0] picks the first token's vector
# (presumably the [CLS] embedding — TODO confirm against the pipeline output shape).
extracted_embeddings = [embedding[0][0] for embedding in embeddings]

# Add the first 5 embeddings/documents with matching metadata and ids.
collection.add(
    embeddings=extracted_embeddings[:5],
    documents=emails_test.tolist()[:5],
    metadatas=[{"source": "emails_test"} for _ in range(5)],
    ids=[f"id{i}" for i in range(5)],
)

collection.count()  # sanity check: how many entries are stored
collection.get()    # sanity check: inspect the stored entries
# ---------------------------------------------------------------------------
# Inspect the sample as a DataFrame and pull the raw text out as a list.
# ---------------------------------------------------------------------------
emails_test_df = emails_test.to_frame()
print(emails_test_df.columns)
print(emails_test_df['message'])        # checking message content for fine-tuning
print(emails_test_df['message'].head())
print(emails_test_df.columns)

num_entries = emails_test_df.shape[0]
print("Number of entries in emails_test_df:", num_entries)

# A second, slightly larger split (kept for reference; same random_state).
emails_train, emails_test2 = train_test_split(messages_df, test_size=0.00001, random_state=42)
print(emails_test2)
print(type(emails_test2))
num_entries2 = emails_test2.shape[0]
print("number of", num_entries2)

# Plain list of strings for the tokenizer.
text_list = emails_test_df['message'].tolist()
print(type(text_list))
print(text_list[:5])
print(text_list[:5])
print(text_list)
print(text_list[2])  # one average email, to decide what to clean up
def remove_sections(email, sections_to_remove=None):
    """Filter header/boilerplate lines out of an email.

    Args:
        email: The email as a list of lines (the raw message split on
            newlines).
        sections_to_remove: Optional iterable of marker substrings; any line
            containing one of them is dropped. Defaults to the standard
            Enron/Outlook header fields, so existing callers are unaffected.

    Returns:
        A new list containing only the lines with none of the markers.
    """
    if sections_to_remove is None:
        # Default markers: mail headers and forwarded-message separators.
        sections_to_remove = (
            "----- Original Message -----",
            "From:",
            "Sent:",
            "To:",
            "CC:",
            "Subject:",
            "Message-ID:",
            "Date:",
            "Mime-Version:",
            "Content-Type:",
            "Content-Transfer-Encoding:",
            "X-cc:",
            "X-bcc:",
            "X-Folder:",
            "X-Origin:",
            "X-FileName:",
            "-----Original Message-----",
        )
    # Single pass over the lines instead of rebuilding the list once per marker.
    return [
        line for line in email
        if not any(marker in line for marker in sections_to_remove)
    ]
# Strip headers from every sampled email, then eyeball the cleaned output.
cleaned_text_list = [remove_sections(email.split("\n")) for email in text_list]
for cleaned_email in cleaned_text_list:
    print("\n".join(cleaned_email))
    print("=" * 50)  # visual separator between emails
# ---------------------------------------------------------------------------
# Fine-tune a language model (facebook/bart-base) on the sampled emails.
# ---------------------------------------------------------------------------
model_name = "facebook/bart-base"
# NOTE: this rebinds `tokenizer`, replacing the BERT tokenizer used above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
def prepare_data(text_list):
    """Preprocess raw email texts into a training Dataset for BART.

    Args:
        text_list: A list of strings containing the text data.

    Returns:
        A ``datasets.Dataset`` with ``input_ids``, ``attention_mask`` and
        ``labels`` columns, where the labels mirror the inputs
        (reconstruction objective) with padding positions masked out.
    """
    # Tokenize with padding and truncation (BART handles both well).
    inputs = tokenizer(text_list, padding="max_length", truncation=True)

    # Labels mirror the input ids, but padding positions are set to -100 so
    # the cross-entropy loss ignores them — otherwise the model would be
    # trained to predict the pad token. (The original copied input_ids as-is.)
    labels = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(inputs["input_ids"], inputs["attention_mask"])
    ]

    return Dataset.from_dict({
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    })
# Build the training set from the cleaned text and load the seq2seq model.
train_data = prepare_data(text_list)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training hyperparameters (adjust as needed).
batch_size = 8
learning_rate = 2e-5
num_epochs = 3
from transformers import Trainer  # re-import is harmless; Trainer was imported above

# Wire together the model, hyperparameters and training data.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",  # output directory for checkpoints etc.
        overwrite_output_dir=True,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_epochs,
    ),
    train_dataset=train_data,
)

# Start the fine-tuning process.
trainer.train()

# Persist the fine-tuned weights + tokenizer for the QA demo below.
model.save_pretrained("./fine-tuned_bart")
tokenizer.save_pretrained("./fine-tuned_bart")
print("Fine-tuning completed! Model saved in ./fine-tuned_bart")
# NOTE: only a very small input sample was used so Colab stopped crashing.
import gradio as gr
from transformers import BartForQuestionAnswering, BartTokenizer

# Load the fine-tuned BART weights into a question-answering model.
# NOTE(review): the fine-tuning above was seq2seq LM, not extractive QA, so
# the QA head is presumably freshly initialised — verify the load warnings.
model = BartForQuestionAnswering.from_pretrained("./fine-tuned_bart")
tokenizer = BartTokenizer.from_pretrained("./fine-tuned_bart")
def answer_question(question):
    """Extract an answer span for *question* with the fine-tuned QA model.

    Args:
        question: The question string typed into the Gradio textbox.

    Returns:
        The decoded answer span as a string.

    NOTE(review): no context passage is supplied, so the model can only pick
    an answer span out of the question itself — consider retrieving a
    relevant email (e.g. from the ChromaDB collection) as context.
    """
    import torch  # bugfix: torch was used below but never imported anywhere

    inputs = tokenizer.encode_plus(question, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs["input_ids"].tolist()[0]

    # Bugfix: modern transformers models return a ModelOutput object, not a
    # (start, end) tuple, so pull the logits out by attribute.
    with torch.no_grad():  # inference only — no gradients needed
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    return tokenizer.decode(input_ids[answer_start:answer_end])
# Wire the QA function into a simple Gradio text-in / text-out interface.
iface = gr.Interface(
    fn=answer_question,
    inputs="text",
    outputs="text",
    title="Question Answering Model",
    description="Enter a question to get the answer.",
)

# Launch the interface.
iface.launch()