MediateChatbot / embed.py
Artem0729's picture
Upload 15 files
b69c947 verified
import streamlit as st
from pathlib import Path
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from typing import Dict, List, Optional
from dotenv import load_dotenv
import os, csv
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_index = os.getenv("PINECONE_INDEX")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
content_list = ["mediator country", "mediator city", "mediator state", "mediator zip code", "mediator areas of practice"]
def summarize(text):
prompt_template = """Write a concise summary of the following context. Summary should be up to 350 characters.
Context: "{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)
llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview", api_key=openai_api_key)
chain = load_summarize_chain(llm, chain_type="stuff")
llm_chain = LLMChain(llm=llm, prompt=prompt)
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
return stuff_chain.invoke([Document(page_content=text)])
return stuff_chain.invoke([Document(page_content=text)])
class MetaDataCSVLoader(BaseLoader):
def __init__(
self,
file_path: str,
source_column: Optional[str] = None,
metadata_columns: Optional[List[str]] = None,
content_columns: Optional[List[str]] =None ,
csv_args: Optional[Dict] = None,
encoding: Optional[str] = None,
):
self.file_path = file_path
self.source_column = source_column
self.encoding = encoding
self.csv_args = csv_args or {}
self.content_columns= content_columns
self.metadata_columns = metadata_columns
def load(self) -> List[Document]:
docs = []
with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
for i, row in enumerate(csv_reader):
if self.content_columns:
content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items() if k in self.content_columns)
else:
content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
try:
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
)
except KeyError:
raise ValueError(
f"Source column '{self.source_column}' not found in CSV file."
)
metadata = {"source": source, "row": i}
if self.metadata_columns:
for k, v in row.items():
if k in self.metadata_columns:
metadata[k] = v
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
return docs
csv_file_uploaded = st.file_uploader(label="Upload your CSV File here")
if csv_file_uploaded is not None:
def save_file_to_folder(uploadedFile):
save_folder = 'content'
save_path = Path(save_folder, uploadedFile.name)
with open(save_path, mode='wb') as w:
w.write(uploadedFile.getvalue())
if save_path.exists():
st.success(f'File {uploadedFile.name} is successfully saved!')
with open(os.path.join('content/', csv_file_uploaded.name), 'r') as file:
csv_reader = csv.reader(file)
headers = next(csv_reader)
filtered_headers= list(filter(lambda x: x != '', headers))
loader = MetaDataCSVLoader(os.path.join('content/', csv_file_uploaded.name),
metadata_columns=filtered_headers, encoding = "utf-8")
data = loader.load()
for datum in data:
new_content = ""
for content in content_list:
new_content += f"{content}: {datum.metadata[content]}\n"
datum.page_content = new_content
# datum.metadata['mediator Biography'] = summarize(datum.metadata['mediator Biography'])
PineconeVectorStore.from_documents(data, embeddings, index_name=pinecone_index)
save_file_to_folder(csv_file_uploaded)