sevdeawesome committed on
Commit
2d5d138
·
1 Parent(s): 14e5f6e

fixed app to remove dumb dependencies that aren't used

Browse files
Files changed (4) hide show
  1. app.deprocated +0 -133
  2. app.py +1 -12
  3. app2.py.deprocated +0 -195
  4. ingest.py +0 -208
app.deprocated DELETED
@@ -1,133 +0,0 @@
1
- '''
2
-
3
- CONFIG AND IMPORTS
4
-
5
- '''
6
- from config import default_config
7
-
8
- from types import SimpleNamespace
9
- import gradio as gr
10
- import os, random
11
- from pathlib import Path
12
- import tiktoken
13
- from getpass import getpass
14
- from rich.markdown import Markdown
15
-
16
- import openai
17
- import wandb
18
- from pprint import pprint
19
- from wandb.integration.openai import autolog
20
- from langchain.text_splitter import MarkdownHeaderTextSplitter
21
-
22
-
23
- from langchain.embeddings import OpenAIEmbeddings
24
- from langchain.vectorstores import Chroma
25
-
26
-
27
-
28
- from tenacity import (
29
- retry,
30
- stop_after_attempt,
31
- wait_random_exponential, # for exponential backoff
32
- )
33
-
34
-
35
-
36
-
37
- if os.getenv("OPENAI_API_KEY") is None:
38
- if any(['VSCODE' in x for x in os.environ.keys()]):
39
- print('Please enter password in the VS Code prompt at the top of your VS Code window!')
40
- os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
41
- openai.api_key = os.getenv("OPENAI_API_KEY", "")
42
-
43
- assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
44
- print("OpenAI API key configured")
45
-
46
-
47
-
48
-
49
-
50
-
51
- def find_nearest_neighbor(argument=""):
52
- '''
53
- INPUT:
54
- argument (str)
55
- vectorDB??
56
- RETURN the nearest neighbor in vectorDB to argument
57
- '''
58
-
59
- md = ""
60
- print(argument)
61
- directory_path = "../../safety_docs"
62
-
63
- for filename in os.listdir(directory_path):
64
- if filename.endswith(".md"):
65
- with open(os.path.join(directory_path, filename), 'r') as file:
66
- content = file.read()
67
- md = md + content
68
-
69
- markdown_document = md
70
-
71
- headers_to_split_on = [
72
- ("#", "Header 1"),
73
- ("##", "Header 2"),
74
- ("###", "Header 3"),
75
- ]
76
-
77
- markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
78
- md_header_splits = markdown_splitter.split_text(markdown_document)
79
-
80
- embeddings = OpenAIEmbeddings()
81
- db = Chroma.from_documents(md_header_splits, embeddings)
82
-
83
- retriever = db.as_retriever(search_kwargs=dict(k=1))
84
-
85
- docs = retriever.get_relevant_documents(argument)
86
-
87
- return docs[0].metadata["Header 1"]
88
-
89
-
90
-
91
- def get_gpt_response(argument, user_prompt, system_prompt=default_config.system_prompt, model=default_config.model_name, n=1, max_tokens=200):
92
- '''
93
- INPUT:
94
- Argument
95
- user_prompt
96
- system_prompt
97
- model
98
- '''
99
-
100
- @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2))
101
- def completion_with_backoff(**kwargs):
102
- return openai.ChatCompletion.create(**kwargs)
103
-
104
- messages=[
105
- {"role": "system", "content": system_prompt},
106
- {"role": "user", "content": user_prompt},
107
- ]
108
- responses = completion_with_backoff(
109
- model=model,
110
- messages=messages,
111
- n = n,
112
- max_tokens=max_tokens
113
- )
114
- for response in responses.choices:
115
- generation = response.message.content
116
- return generation
117
-
118
-
119
- def greet(argument):
120
- nearest_neighbor = find_nearest_neighbor(argument)
121
- user_prompt = default_config.user_prompt_1 + argument + default_config.user_prompt_2
122
- response = get_gpt_response(argument, user_prompt)
123
- return "Hello " + argument + "\n nice argument, it actually is a common one: " + nearest_neighbor + "\n gpt response: \n" + response
124
-
125
-
126
- demo = gr.Interface(
127
- fn=greet,
128
- inputs=gr.Textbox(lines=2, placeholder="poob here"),
129
- outputs="text"
130
- )
131
-
132
- demo.queue(max_size=20)
133
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -11,27 +11,16 @@ import os, random
11
  from pathlib import Path
12
  import tiktoken
13
  from getpass import getpass
14
- from rich.markdown import Markdown
15
 
16
  import openai
17
- import wandb
18
- from pprint import pprint
19
- from wandb.integration.openai import autolog
20
  from langchain.text_splitter import MarkdownHeaderTextSplitter
21
  import numpy as np
22
 
23
  from langchain.embeddings import OpenAIEmbeddings
24
- from langchain.vectorstores import Chroma
25
 
26
 
27
 
28
- from tenacity import (
29
- retry,
30
- stop_after_attempt,
31
- wait_random_exponential, # for exponential backoff
32
- )
33
-
34
-
35
 
36
 
37
  if os.getenv("OPENAI_API_KEY") is None:
 
11
  from pathlib import Path
12
  import tiktoken
13
  from getpass import getpass
 
14
 
15
  import openai
 
 
 
16
  from langchain.text_splitter import MarkdownHeaderTextSplitter
17
  import numpy as np
18
 
19
  from langchain.embeddings import OpenAIEmbeddings
20
+ # from langchain.vectorstores import Chroma
21
 
22
 
23
 
 
 
 
 
 
 
 
24
 
25
 
26
  if os.getenv("OPENAI_API_KEY") is None:
app2.py.deprocated DELETED
@@ -1,195 +0,0 @@
1
- '''
2
-
3
- CONFIG AND IMPORTS
4
-
5
- '''
6
- from config import default_config
7
-
8
- from types import SimpleNamespace
9
- import gradio as gr
10
- import os, random
11
- from pathlib import Path
12
- import tiktoken
13
- from getpass import getpass
14
- from rich.markdown import Markdown
15
-
16
- import openai
17
- import wandb
18
- from pprint import pprint
19
- from wandb.integration.openai import autolog
20
- from langchain.text_splitter import MarkdownHeaderTextSplitter
21
-
22
-
23
- from langchain.embeddings import OpenAIEmbeddings
24
- from langchain.vectorstores import Chroma
25
-
26
-
27
-
28
- from tenacity import (
29
- retry,
30
- stop_after_attempt,
31
- wait_random_exponential, # for exponential backoff
32
- )
33
-
34
-
35
-
36
-
37
- if os.getenv("OPENAI_API_KEY") is None:
38
- if any(['VSCODE' in x for x in os.environ.keys()]):
39
- print('Please enter password in the VS Code prompt at the top of your VS Code window!')
40
- os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
41
- openai.api_key = os.getenv("OPENAI_API_KEY", "")
42
-
43
- assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
44
- print("OpenAI API key configured")
45
-
46
-
47
-
48
-
49
-
50
-
51
- def find_nearest_neighbor(argument="", max_args_in_output=3):
52
- '''
53
- INPUT:
54
- argument (string)
55
-
56
- RETURN the nearest neighbor(s) in vectorDB to argument as string
57
- '''
58
-
59
- md = ""
60
- print(argument)
61
- directory_path = "../../safety_docs"
62
-
63
- for filename in os.listdir(directory_path):
64
- if filename.endswith(".md"):
65
- with open(os.path.join(directory_path, filename), 'r') as file:
66
- content = file.read()
67
- md = md + content
68
-
69
- markdown_document = md
70
-
71
- headers_to_split_on = [
72
- ("#", "Header 1"),
73
- ("##", "Header 2"),
74
- ("###", "Header 3"),
75
- ]
76
-
77
- markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
78
- md_header_splits = markdown_splitter.split_text(markdown_document)
79
-
80
- embeddings = OpenAIEmbeddings()
81
- db = Chroma.from_documents(md_header_splits, embeddings)
82
-
83
- retriever = db.as_retriever(search_kwargs=dict(k=11))
84
-
85
- docs = retriever.get_relevant_documents(argument)
86
-
87
- output = "" # output to return, a list of common args
88
- seen = set() # which documents have been added to output
89
- count = 0 # count how many embeddings have been added to output
90
- for doc in docs:
91
- if doc.metadata["Header 1"] not in seen:
92
- output = output + doc.metadata["Header 1"] + '\n'
93
- count = count + 1
94
- seen.add(doc.metadata["Header 1"])
95
- if count >= max_args_in_output:
96
- break
97
-
98
- return output
99
-
100
-
101
-
102
- def get_gpt_response(argument, user_prompt, system_prompt=default_config.system_prompt, model=default_config.model_name, n=1, max_tokens=200):
103
- '''
104
- INPUT:
105
- Argument
106
- user_prompt
107
- system_prompt
108
- model
109
- '''
110
-
111
- @retry(wait=wait_random_exponential(min=1, max=3), stop=stop_after_attempt(1))
112
- def completion_with_backoff(**kwargs):
113
- return openai.ChatCompletion.create(**kwargs)
114
-
115
- messages=[
116
- {"role": "system", "content": system_prompt},
117
- {"role": "user", "content": user_prompt},
118
- ]
119
- responses = completion_with_backoff(
120
- model=model,
121
- messages=messages,
122
- n = n,
123
- max_tokens=max_tokens
124
- )
125
- for response in responses.choices:
126
- generation = response.message.content
127
- return generation
128
-
129
-
130
- def greet(argument):
131
- nearest_neighbor = find_nearest_neighbor(argument)
132
- user_prompt = default_config.user_prompt_1 + argument + default_config.user_prompt_2
133
- # response = get_gpt_response(argument, user_prompt)
134
- response = "chatbot response here"
135
- return "Hello " + "\n We think your argument matches common arguments in our database, is it one of these?:\n " + nearest_neighbor + "\n\n\n ------------------------- \n\n\n Lengthy response: \n" + response
136
-
137
-
138
- demo = gr.Interface(
139
- fn=greet,
140
- inputs=gr.Textbox(lines=2, placeholder="Anything past 200 tokens (roughly 200 words) will be cutoff. Please enter <=1 paragraph"),
141
- outputs="text"
142
- )
143
-
144
- # demo.queue(max_size=20)
145
- demo.launch()
146
-
147
-
148
-
149
-
150
-
151
-
152
-
153
-
154
-
155
-
156
-
157
-
158
-
159
-
160
- def find_nearest_neighbor(argument=""):
161
- '''
162
- INPUT:
163
- argument (string)
164
-
165
- RETURN the nearest neighbor(s) in vectorDB to argument as string
166
- '''
167
-
168
- md = ""
169
- directory_path = "../../safety_docs"
170
-
171
- for filename in os.listdir(directory_path):
172
- if filename.endswith(".md"):
173
- with open(os.path.join(directory_path, filename), 'r') as file:
174
- content = file.read()
175
- md = md + content
176
-
177
- markdown_document = md
178
- headers_to_split_on = [
179
- ("#", "Header 1"),
180
- ("##", "Header 2"),
181
- ("###", "Header 3"),
182
- ]
183
-
184
- markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
185
- md_header_splits = markdown_splitter.split_text(markdown_document)
186
-
187
- embeddings = OpenAIEmbeddings()
188
- db = Chroma.from_documents(md_header_splits, embeddings)
189
-
190
- retriever = db.as_retriever(search_kwargs=dict(k=11))
191
-
192
- docs = retriever.get_relevant_documents(argument)
193
-
194
- # return the content of the nearest neighbor document
195
- return docs[0].metadata["Header 1"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ingest.py DELETED
@@ -1,208 +0,0 @@
1
- """Ingest a directory of documentation files into a vector store and store the relevant artifacts in Weights & Biases"""
2
- import argparse
3
- import json
4
- import logging
5
- import os
6
- import pathlib
7
- from typing import List, Tuple
8
-
9
- import langchain
10
- import wandb
11
- from langchain.cache import SQLiteCache
12
- from langchain.docstore.document import Document
13
- from langchain.document_loaders import UnstructuredMarkdownLoader
14
- from langchain.embeddings import OpenAIEmbeddings
15
- from langchain.text_splitter import MarkdownTextSplitter
16
- from langchain.vectorstores import Chroma
17
-
18
- langchain.llm_cache = SQLiteCache(database_path="langchain.db")
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- def load_documents(data_dir: str) -> List[Document]:
24
- """Load documents from a directory of markdown files
25
-
26
- Args:
27
- data_dir (str): The directory containing the markdown files
28
-
29
- Returns:
30
- List[Document]: A list of documents
31
- """
32
- md_files = list(map(str, pathlib.Path(data_dir).glob("*.md")))
33
- documents = [
34
- UnstructuredMarkdownLoader(file_path=file_path).load()[0]
35
- for file_path in md_files
36
- ]
37
- return documents
38
-
39
-
40
- def chunk_documents(
41
- documents: List[Document], chunk_size: int = 500, chunk_overlap=0
42
- ) -> List[Document]:
43
- """Split documents into chunks
44
-
45
- Args:
46
- documents (List[Document]): A list of documents to split into chunks
47
- chunk_size (int, optional): The size of each chunk. Defaults to 500.
48
- chunk_overlap (int, optional): The number of tokens to overlap between chunks. Defaults to 0.
49
-
50
- Returns:
51
- List[Document]: A list of chunked documents.
52
- """
53
- markdown_text_splitter = MarkdownTextSplitter(
54
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
55
- )
56
- split_documents = markdown_text_splitter.split_documents(documents)
57
- return split_documents
58
-
59
-
60
- def create_vector_store(
61
- documents,
62
- vector_store_path: str = "./vector_store",
63
- ) -> Chroma:
64
- """Create a ChromaDB vector store from a list of documents
65
-
66
- Args:
67
- documents (_type_): A list of documents to add to the vector store
68
- vector_store_path (str, optional): The path to the vector store. Defaults to "./vector_store".
69
-
70
- Returns:
71
- Chroma: A ChromaDB vector store containing the documents.
72
- """
73
- api_key = os.environ.get("OPENAI_API_KEY", None)
74
- embedding_function = OpenAIEmbeddings(openai_api_key=api_key)
75
- vector_store = Chroma.from_documents(
76
- documents=documents,
77
- embedding=embedding_function,
78
- persist_directory=vector_store_path,
79
- )
80
- vector_store.persist()
81
- return vector_store
82
-
83
-
84
- def log_dataset(documents: List[Document], run: "wandb.run"):
85
- """Log a dataset to wandb
86
-
87
- Args:
88
- documents (List[Document]): A list of documents to log to a wandb artifact
89
- run (wandb.run): The wandb run to log the artifact to.
90
- """
91
- document_artifact = wandb.Artifact(name="documentation_dataset", type="dataset")
92
- with document_artifact.new_file("documents.json") as f:
93
- for document in documents:
94
- f.write(document.json() + "\n")
95
-
96
- run.log_artifact(document_artifact)
97
-
98
-
99
- def log_index(vector_store_dir: str, run: "wandb.run"):
100
- """Log a vector store to wandb
101
-
102
- Args:
103
- vector_store_dir (str): The directory containing the vector store to log
104
- run (wandb.run): The wandb run to log the artifact to.
105
- """
106
- index_artifact = wandb.Artifact(name="vector_store", type="search_index")
107
- index_artifact.add_dir(vector_store_dir)
108
- run.log_artifact(index_artifact)
109
-
110
-
111
- def log_prompt(prompt: dict, run: "wandb.run"):
112
- """Log a prompt to wandb
113
-
114
- Args:
115
- prompt (str): The prompt to log
116
- run (wandb.run): The wandb run to log the artifact to.
117
- """
118
- prompt_artifact = wandb.Artifact(name="chat_prompt", type="prompt")
119
- with prompt_artifact.new_file("prompt.json") as f:
120
- f.write(json.dumps(prompt))
121
- run.log_artifact(prompt_artifact)
122
-
123
-
124
- def ingest_data(
125
- docs_dir: str,
126
- chunk_size: int,
127
- chunk_overlap: int,
128
- vector_store_path: str,
129
- ) -> Tuple[List[Document], Chroma]:
130
- """Ingest a directory of markdown files into a vector store
131
-
132
- Args:
133
- docs_dir (str):
134
- chunk_size (int):
135
- chunk_overlap (int):
136
- vector_store_path (str):
137
-
138
-
139
- """
140
- # load the documents
141
- documents = load_documents(docs_dir)
142
- # split the documents into chunks
143
- split_documents = chunk_documents(documents, chunk_size, chunk_overlap)
144
- # create document embeddings and store them in a vector store
145
- vector_store = create_vector_store(split_documents, vector_store_path)
146
- return split_documents, vector_store
147
-
148
-
149
- def get_parser():
150
- parser = argparse.ArgumentParser()
151
- parser.add_argument(
152
- "--docs_dir",
153
- type=str,
154
- required=True,
155
- help="The directory containing the wandb documentation",
156
- )
157
- parser.add_argument(
158
- "--chunk_size",
159
- type=int,
160
- default=500,
161
- help="The number of tokens to include in each document chunk",
162
- )
163
- parser.add_argument(
164
- "--chunk_overlap",
165
- type=int,
166
- default=0,
167
- help="The number of tokens to overlap between document chunks",
168
- )
169
- parser.add_argument(
170
- "--vector_store",
171
- type=str,
172
- default="./vector_store",
173
- help="The directory to save or load the Chroma db to/from",
174
- )
175
- parser.add_argument(
176
- "--prompt_file",
177
- type=pathlib.Path,
178
- default="./chat_prompt.json",
179
- help="The path to the chat prompt to use",
180
- )
181
- parser.add_argument(
182
- "--wandb_project",
183
- default="llmapps",
184
- type=str,
185
- help="The wandb project to use for storing artifacts",
186
- )
187
-
188
- return parser
189
-
190
-
191
- def main():
192
- parser = get_parser()
193
- args = parser.parse_args()
194
- run = wandb.init(project=args.wandb_project, config=args)
195
- documents, vector_store = ingest_data(
196
- docs_dir=args.docs_dir,
197
- chunk_size=args.chunk_size,
198
- chunk_overlap=args.chunk_overlap,
199
- vector_store_path=args.vector_store,
200
- )
201
- log_dataset(documents, run)
202
- log_index(args.vector_store, run)
203
- log_prompt(json.load(args.prompt_file.open("r")), run)
204
- run.finish()
205
-
206
-
207
- if __name__ == "__main__":
208
- main()