# NOTE(review): removed Colab paste artifacts ("Spaces:" / "Runtime error" x2) —
# they were cell-output residue, not part of the program.
| # -*- coding: utf-8 -*- | |
| """Copy of assessment3_Elina_Hemink.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW | |
| ## Create embeddings of the email dataset and store in a chromadb database | |
| """ | |
| import chromadb | |
| from chromadb.utils import embedding_functions | |
| import pandas as pd | |
| import email | |
| from sklearn.model_selection import train_test_split | |
# Load the Enron email dataset; expects an 'emails.csv' in the working
# directory with a 'message' column (raw RFC-822 text, parsed below).
emails = pd.read_csv('emails.csv')
# Quick sanity check of the loaded frame.
print(emails.head())
# Parse each raw message and collect its cleaned text body into a list.
content_text = []
for item in emails.message:
    msg = email.message_from_string(item)
    payload = msg.get_payload()
    # BUG FIX: get_payload() returns a *list* of sub-messages for multipart
    # emails, on which .replace() would raise AttributeError. Flatten any
    # multipart payload to the concatenated text of its parts first.
    if isinstance(payload, list):
        payload = " ".join(str(part.get_payload()) for part in payload)
    # Strip newlines and a recurring quoted-reply marker from the body.
    cleaned_message = payload.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content_text.append(cleaned_message)
# Take a small random sample: the full corpus is too large to embed here.
train, test = train_test_split(content_text, train_size=0.001)

# Sequential string ids ('id1', 'id2', ...) as required by ChromaDB's API.
# (Original used a manual loop with a variable named `id`, shadowing the builtin.)
ids = [f"id{i + 1}" for i in range(len(train))]

# Create an in-memory ChromaDB collection and add the sampled documents;
# ChromaDB computes embeddings with its default embedding function.
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
collection.add(
    documents=train,
    ids=ids,
)
"""## Fine-tune a Language Model on the Dataset"""
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load pre-trained GPT-2 tokenizer and model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token; add one for batched tokenization.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# BUG FIX: adding a token grows the vocab past the model's embedding matrix;
# without resizing, the new token id is out of range and training crashes.
model.resize_token_embeddings(len(tokenizer))

# BUG FIX: the original tokenized the emails, wrote the numeric token ids to
# this file, and then let TextDataset tokenize that file AGAIN — so the model
# was trained on strings of digits, not on the emails. Write the raw email
# text instead; TextDataset performs the single, correct tokenization itself.
with open('tokenized_emails.txt', 'w', encoding='utf-8') as f:
    for message in train:
        f.write(message.replace('\n', ' ') + '\n')

# Build the LM dataset from the text file (128-token chunks).
dataset = TextDataset(tokenizer=tokenizer, file_path='tokenized_emails.txt', block_size=128)

# Causal-LM collator: mlm=False means labels are the next-token targets.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Training configuration: 3 epochs, batch size 8 per device,
# checkpoints and outputs written under ./output.
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)
# The Trainer wires together the model, collator, and dataset built above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
# Fine-tune the model on the email dataset (long-running step).
trainer.train()
# Persist the fine-tuned weights and tokenizer for the Gradio app below.
# NOTE(review): "/fine_tuned_model" is an absolute path at the filesystem
# root — likely intended to be a relative path; confirm write permissions.
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")
"""## Create a Gradio Interface"""
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Reload the fine-tuned artifacts saved by the training step above.
model_dir = "/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)
# Text-generation pipeline used by the Gradio handler below.
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
| # Define question_answering function | |
def question_answer(question, generator=None):
    """Generate an answer to *question* with the fine-tuned model.

    Args:
        question: The prompt typed into the Gradio textbox.
        generator: Optional text-generation callable with the pipeline
            signature; defaults to the module-level `text_gen` pipeline.
            Injectable for testing.

    Returns:
        The generated continuation as a string, with the echoed prompt
        removed and surrounding whitespace stripped.
    """
    gen = generator if generator is not None else text_gen
    generated = gen(question, max_length=200, num_return_sequences=1)
    # BUG FIX: the original int()-parsed the generated *text* as token ids
    # and re-decoded it, which raises ValueError on ordinary generated text.
    # The pipeline already returns decoded text; just strip the prompt echo.
    answer = generated[0]['generated_text'].replace(question, "")
    return answer.strip()
| # Set up gradio interface | |
| iface = gr.Interface(fn = question_answer, inputs='text', outputs='text', title='Fine-tuned Enron Question Answering', | |
| description='Ask a question regarding the Enron case') | |
| iface.launch() | |
| """## Deploy the Gradio Interface in a Huggingface Space""" | |