File size: 4,960 Bytes
fb54f5e
 
 
 
 
 
 
 
 
 
f0248c9
 
 
fb54f5e
 
 
 
 
 
 
 
f0248c9
fb54f5e
 
 
 
 
 
 
f0248c9
 
 
 
 
 
fb54f5e
 
 
f0248c9
 
 
 
b69c947
fb54f5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0248c9
fb54f5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b69c947
fb54f5e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import streamlit as st
from pathlib import Path

from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate

from typing import Dict, List, Optional
from dotenv import load_dotenv
import os, csv

# Pull OPENAI_API_KEY / PINECONE_* credentials from a local .env into the
# process environment before anything reads them.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_index = os.getenv("PINECONE_INDEX")  # name of the target Pinecone index
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Embedding model used when upserting the CSV-derived documents into Pinecone.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# CSV metadata columns that are concatenated to form each Document's
# page_content (see the upload handler at the bottom of the file).
content_list = ["mediator country", "mediator city", "mediator state", "mediator zip code", "mediator areas of practice"]

def summarize(text):
    """Summarize *text* (target: up to 350 characters) with a "stuff" chain.

    Args:
        text: Raw text to condense.

    Returns:
        The ``StuffDocumentsChain.invoke`` result (a dict; the summary is
        under its ``output_text`` key per the chain's contract).
    """
    prompt_template = """Write a concise summary of the following context. Summary should be up to 350 characters.
    Context: "{text}"
    CONCISE SUMMARY:"""

    prompt = PromptTemplate.from_template(prompt_template)

    llm = ChatOpenAI(temperature=0, model_name="gpt-4-1106-preview", api_key=openai_api_key)

    # NOTE(review): the original also built an unused load_summarize_chain()
    # object and had a second, unreachable return — both removed.
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")
    return stuff_chain.invoke([Document(page_content=text)])
    
class MetaDataCSVLoader(BaseLoader):
    """CSV loader that maps chosen columns into ``Document`` metadata.

    Each CSV row becomes one ``Document``: ``page_content`` is built from
    ``content_columns`` (or every column when none are given), and metadata
    carries ``source``/``row`` plus any columns listed in
    ``metadata_columns``.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        content_columns: Optional[List[str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """Remember the file location and the column-selection options.

        Args:
            file_path: Path of the CSV file to load.
            source_column: Column whose value becomes each Document's
                ``source`` metadata; defaults to ``file_path`` when None.
            metadata_columns: Columns copied verbatim into metadata.
            content_columns: Columns used to build ``page_content``;
                all columns are used when None/empty.
            csv_args: Extra keyword arguments for ``csv.DictReader``.
            encoding: Text encoding passed to ``open``.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
        self.content_columns = content_columns
        self.metadata_columns = metadata_columns

    def _format_row(self, row: Dict) -> str:
        """Render one row as ``key: value`` lines, one column per line.

        Guards against ``csv.DictReader`` irregularities: short rows yield
        ``None`` values (original code crashed on ``v.strip()``) and extra
        fields land under the ``None`` restkey (crashed on ``k.strip()``).
        """
        return "\n".join(
            f"{k.strip()}: {(v or '').strip()}"
            for k, v in row.items()
            if k is not None
            and (not self.content_columns or k in self.content_columns)
        )

    def load(self) -> List[Document]:
        """Read the CSV and return one ``Document`` per data row.

        Returns:
            List of Documents in file order; metadata always includes
            ``source`` and the 0-based ``row`` index.

        Raises:
            ValueError: if ``source_column`` is set but missing from a row.
        """
        docs: List[Document] = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
            for i, row in enumerate(csv_reader):
                content = self._format_row(row)
                if self.source_column is None:
                    source = self.file_path
                else:
                    try:
                        source = row[self.source_column]
                    except KeyError:
                        raise ValueError(
                            f"Source column '{self.source_column}' not found in CSV file."
                        )
                metadata = {"source": source, "row": i}
                if self.metadata_columns:
                    for k, v in row.items():
                        if k in self.metadata_columns:
                            metadata[k] = v
                docs.append(Document(page_content=content, metadata=metadata))

        return docs


csv_file_uploaded = st.file_uploader(label="Upload your CSV File here")

if csv_file_uploaded is not None:
    def save_file_to_folder(uploadedFile):
        """Persist the upload under ./content, then embed its rows into Pinecone.

        Args:
            uploadedFile: Streamlit UploadedFile from ``st.file_uploader``.
        """
        save_folder = 'content'
        # Create the target folder if missing — a fresh deploy has no
        # ./content directory and open(..., 'wb') would fail.
        Path(save_folder).mkdir(parents=True, exist_ok=True)
        save_path = Path(save_folder, uploadedFile.name)
        with open(save_path, mode='wb') as w:
            w.write(uploadedFile.getvalue())

        if save_path.exists():
            st.success(f'File {uploadedFile.name} is successfully saved!')

            # Read only the header row; every non-empty header becomes a
            # metadata column for the loader below.
            with open(save_path, 'r', encoding='utf-8', newline='') as file:
                headers = next(csv.reader(file))
            filtered_headers = [h for h in headers if h != '']

            loader = MetaDataCSVLoader(str(save_path),
                metadata_columns=filtered_headers, encoding="utf-8")
            data = loader.load()
            for datum in data:
                # Replace page_content with just the whitelisted mediator
                # fields (location + practice areas) pulled from metadata.
                datum.page_content = "".join(
                    f"{content}: {datum.metadata[content]}\n"
                    for content in content_list
                )
                # Biography summarization is intentionally disabled:
                # datum.metadata['mediator Biography'] = summarize(datum.metadata['mediator Biography'])
            PineconeVectorStore.from_documents(data, embeddings, index_name=pinecone_index)

    save_file_to_folder(csv_file_uploaded)