Spaces:
Runtime error
Runtime error
| # https://huggingface.co/spaces/micknikolic/enron | |
| # here are the imports | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.chains import RetrievalQA | |
| from langchain import OpenAI, VectorDBQA | |
| from langchain.document_loaders import DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| import os | |
| import nltk | |
| import pytesseract | |
| import pandas as pd | |
| pd.set_option('display.max_columns',None, | |
| 'display.max_rows',None, | |
| 'display.max_colwidth',None | |
| ) | |
| import numpy as np | |
| import os | |
| import re | |
| import io | |
| import gradio | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # here is the code | |
# Data loading.
# Only a subset of the Enron dataset is used: working locally with more than
# 500k observations would be computationally very expensive.
data = pd.read_csv('subset_enron.csv', encoding='utf-8')
# Keep a reproducible 1% random sample (author's note on the shape: (5174, 2)).
data = data.sample(frac=0.01, random_state=12)

# Text pre-processing.
# The pattern matches one or two literal backslashes followed by the letter
# "n", i.e. newline escapes stored as plain text; each match is deleted.
_ESCAPED_NEWLINE = re.compile(r'\\{1,2}n')
cleaned_message = data["message"].map(
    lambda message: _ESCAPED_NEWLINE.sub('', message)
)
content = cleaned_message.tolist()
class Document:
    """Minimal container pairing a piece of text with its metadata.

    Instances expose ``page_content`` and ``metadata`` attributes; in this
    script they are handed to ``text_splitter.split_documents``.

    Args:
        page_content: the text held by the document.
        metadata: optional metadata object; when omitted (``None``) every
            instance receives its own new empty dict.
    """

    def __init__(self, page_content, metadata=None):
        # Create the default inside the call so instances never share a dict.
        if metadata is None:
            metadata = {}
        self.page_content = page_content
        self.metadata = metadata
# Wrap every cleaned message in a Document so it can be split into chunks.
documents = list(map(Document, content))

# Split the documents into chunks of at most 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma store.
# The API key is read from the "openai_api_key" environment variable.
openAI_embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get("openai_api_key"),
)
vStore = Chroma.from_documents(documents=texts, embedding=openAI_embeddings)

# Retrieval QA
# The LLM is built first and the retriever second, then both are combined
# into a "stuff" chain.
_llm = OpenAI(
    openai_api_key=os.environ.get("openai_api_key"),
    temperature=0.2,
    top_p=0.2,
    max_tokens=2000,
)
_retriever = vStore.as_retriever()
model_retrieval = RetrievalQA.from_chain_type(
    llm=_llm,
    chain_type="stuff",
    retriever=_retriever,
)
| # Build the Gradio-based app around the retrieval model. | |
def get_answer(question):
    """
    Answer a question by running it through the retrieval QA chain.

    Args:
        question (string): the end-user's input.

    Returns:
        whatever ``model_retrieval.run`` produces for the question, i.e. the
        model's answer based on the enron emails dataset.
    """
    return model_retrieval.run(question)
# UI components: one text box for the question, one for the answer.
_question_box = gradio.Textbox(label="Enter your question here")
_answer_box = gradio.Textbox(label="Answer")

# Sample questions offered to the user alongside the input box.
_example_questions = [
    "Who are the receivers of the emails from this corpus of emails?",
    "What's at the center of these emails?",
]

# Wire get_answer to the components and start the app.
iface = gradio.Interface(
    fn=get_answer,
    inputs=_question_box,
    outputs=[_answer_box],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=_example_questions,
)
iface.launch()