# -*- coding: utf-8 -*-
"""assessment3_Maria_Maraki.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1jm_hI8O4Y0HgNNdWLnkLBIjlzSaGwwBS
"""

###########################################################################################################################################################
# The provided code has undergone minor adjustments from its original source (Colab environment) to ensure its compatibility with the Hugging Face ecosystem.
###########################################################################################################################################################
| """Since the dataset **emails.csv** in the [Enron Email Dataset](https://www.kaggle.com/datasets/wcukierski/enron-email-dataset/code) was too big, I split the original dataset into smaller .csv files and then chose one of the split files: ***emails_subset.csv*** | |
| This is the code I used: | |
```
import os
import pandas as pd

def split_csv(input_file, output_folder, chunk_size):
    os.makedirs(output_folder, exist_ok=True)
    reader = pd.read_csv(input_file, chunksize=chunk_size)
    for i, chunk in enumerate(reader):
        chunk.to_csv(os.path.join(output_folder, f"output_{i}.csv"), index=False)

input_file = 'emails.csv'
output_folder = 'split_files'

target_size = 1000000  # aim for ~1 MB per output file
total_rows = sum(1 for line in open(input_file))  # rough row count (multi-line quoted fields inflate it slightly)
rows_per_chunk = max(1, total_rows * target_size // os.path.getsize(input_file))
split_csv(input_file, output_folder, rows_per_chunk)
```
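For scale: the full Kaggle dump is roughly 1.4 GB with about 517,000 rows, so rows_per_chunk works out to about 517,000 × 1,000,000 // 1,400,000,000 ≈ 369 rows per ~1 MB output file (approximate figures, for illustration only).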
P.S. I didn't run this in this notebook because I'm working in Google Colab and, due to its size, couldn't upload the original file to my Google Drive.
"""
import pandas as pd

# Show full dataframe contents when displaying previews
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

email_data = pd.read_csv('emails_subset.csv')
email_data.head()
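# Optional sanity check: the Kaggle dump stores each raw email in a 'message' column,
# which the parsing below relies on; fail early if the split dropped it.
assert 'message' in email_data.columns, "emails_subset.csv is missing the 'message' column"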
| """# Embeddings of the email dataset stored in a ChromaDB database""" | |
| import email | |
| import openai | |
| import os | |
| import numpy as np | |
| import chromadb | |
| import nltk | |
| import pytesseract | |
| import gradio as gr | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
| from langchain.chains import RetrievalQA | |
| from langchain import OpenAI, VectorDBQA | |
| from langchain.document_loaders import DirectoryLoader | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| openAI_embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get('OPENAI_API_KEY')) | |
# Parse each raw RFC 2822 message and keep only its cleaned body text
content = []
for item in email_data.message:
    text = email.message_from_string(item)
    message = text.get_payload()
    cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content.append(cleaned_message)
# Minimal stand-in for LangChain's Document class so the splitter can consume raw strings
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

documents = [Document(page_content) for page_content in content]

# Split each email body into chunks of at most 1000 characters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
final_text = text_splitter.split_documents(documents)
# Embed the chunks and store them in an in-memory ChromaDB collection
collection = Chroma.from_documents(
    documents=final_text,
    embedding=openAI_embeddings)
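# Quick retrieval sanity check (hypothetical query, safe to remove): make sure the
# collection returns relevant chunks before wiring it into the QA chain below.
sample_hits = collection.similarity_search("energy trading", k=2)
if sample_hits:
    print("First retrieved chunk:", sample_hits[0].page_content[:100])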
| """# Fine-tuning a Language Model on the Dataset | |
| The fine-tuning task kept crushing my notebook and I had to restart so I stored it into a different notebook. | |
| """ | |
| """# Gradio Interface that answers questions related to the case""" | |
| email_data_retrieval = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=os.environ.get('OPENAI_API_KEY'), | |
| temperature=0.6, | |
| top_p=0.5, | |
| max_tokens=500), | |
| chain_type='stuff', retriever=collection.as_retriever()) | |
def qa_retrieval(question):
    answer = email_data_retrieval.run(question)
    return answer
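# Quick smoke test (hypothetical question; uncomment to run, it issues one OpenAI call):
# print(qa_retrieval("What topics come up most often in these emails?"))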
iface = gr.Interface(
    fn=qa_retrieval,
    inputs=gr.Textbox(label="Write your question regarding the Enron Case here:"),
    outputs=gr.Textbox(label="Answer of the question:"),
    title="QA Retrieval - Case Specific: Enron Email Dataset"
)
iface.launch()
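# Note: on Hugging Face Spaces, launch() serves the app automatically; when running
# locally, iface.launch(share=True) would also create a temporary public link.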