Thomas Richardson committed on
Commit
ac40df2
·
unverified ·
2 Parent(s): 5599856a10afa8

Merge pull request #76 from ttt246/feature/training

Browse files
Brain/src/common/utils.py CHANGED
@@ -30,8 +30,8 @@ DEFAULT_GPT_MODEL = "gpt-4"
30
  AGENT_NAME = "RisingBrain Assistant"
31
 
32
  # indexes of relatedness of embedding
33
- COMMAND_SMS_INDEXS = [4, 5]
34
- COMMAND_BROWSER_OPEN = [10]
35
 
36
  # Twilio
37
  ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
@@ -77,6 +77,7 @@ def parseJsonFromCompletion(data: str) -> json:
77
  result = result.replace("': '", '": "')
78
  result = result.replace("': \\\"", '": \"')
79
  result = result.replace("', '", '", "')
 
80
 
81
  substring = '\\"}'
82
  replacement = '\"}'
@@ -86,8 +87,10 @@ def parseJsonFromCompletion(data: str) -> json:
86
  if index == len(result) - 3:
87
  result = result[:index] + replacement + result[index + len(substring):]
88
  # fmt: on
89
- result = json.loads(result.replace("':", '":'))
90
- return result
 
 
91
 
92
 
93
  def parseUrlFromStr(text: str) -> str:
 
30
  AGENT_NAME = "RisingBrain Assistant"
31
 
32
  # indexes of relatedness of embedding
33
+ COMMAND_SMS_INDEXES = ["pWDrks5DO1bEPLlUtQ1f", "LEpAhmFi8tAOQUE7LHZZ"] # 4, 5
34
+ COMMAND_BROWSER_OPEN = ["taVNeDINonUqJWXBlESU"] # 10
35
 
36
  # Twilio
37
  ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
 
77
  result = result.replace("': '", '": "')
78
  result = result.replace("': \\\"", '": \"')
79
  result = result.replace("', '", '", "')
80
+ result = result.replace("':", '":')
81
 
82
  substring = '\\"}'
83
  replacement = '\"}'
 
87
  if index == len(result) - 3:
88
  result = result[:index] + replacement + result[index + len(substring):]
89
  # fmt: on
90
+ try:
91
+ return json.loads(result)
92
+ except Exception as e:
93
+ return result
94
 
95
 
96
  def parseUrlFromStr(text: str) -> str:
Brain/src/model/requests/request_model.py CHANGED
@@ -130,6 +130,14 @@ class TrainContacts(BasicReq):
130
  contacts: list[ContactReq]
131
 
132
 
 
 
 
 
 
 
 
 
133
  """endpoint /browser/item"""
134
 
135
 
@@ -140,3 +148,16 @@ class BrowserItem(BasicReq):
140
 
141
  items: list[ItemReq]
142
  prompt: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  contacts: list[ContactReq]
131
 
132
 
133
+ """endpoint: /document"""
134
+
135
+
136
+ class Document(BasicReq):
137
+ document_id: str
138
+ page_content: str
139
+
140
+
141
  """endpoint /browser/item"""
142
 
143
 
 
148
 
149
  items: list[ItemReq]
150
  prompt: str
151
+
152
+
153
+ """endpoint /train"""
154
+
155
+
156
+ class Train(BasicReq):
157
+ class TrainData(BaseModel):
158
+ page_content: str
159
+ timestamp: float
160
+
161
+ id: str
162
+ data: TrainData
163
+ status: str
Brain/src/model/train_model.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """train model:
2
+ {
3
+ "id": "String",
4
+ "data": [{"page_content": "String", "timestamp": 0}],
5
+ "status": "created | updated | deleted",
6
+ }"""
7
+
8
+ from Brain.src.model.requests.request_model import Train
9
+
10
+
11
+ class TrainModel:
12
+ def __init__(self, train_data: Train):
13
+ self.id = train_data.id
14
+ self.data = train_data.data
15
+ self.status = TrainStatus.UPDATED
16
+
17
+
18
+ """train status: created | updated | deleted"""
19
+
20
+
21
+ class TrainStatus:
22
+ CREATED = "created"
23
+ UPDATED = "updated"
24
+ DELETED = "deleted"
Brain/src/rising_plugin/guardrails-config/actions/actions.py CHANGED
@@ -13,24 +13,21 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
16
- import os
17
  import json
18
- import numpy as np
19
 
20
- from langchain.embeddings.openai import OpenAIEmbeddings
21
- from langchain.vectorstores import utils
22
- from langchain.document_loaders.csv_loader import CSVLoader
23
  from langchain.docstore.document import Document
24
 
25
  from Brain.src.common.brain_exception import BrainException
26
  from Brain.src.common.utils import (
27
- OPENAI_API_KEY,
28
- COMMAND_SMS_INDEXS,
29
  COMMAND_BROWSER_OPEN,
 
30
  )
31
  from Brain.src.rising_plugin.image_embedding import (
32
  query_image_text,
33
  )
 
34
 
35
  from nemoguardrails.actions import action
36
 
@@ -44,50 +41,44 @@ from Brain.src.rising_plugin.llm.llms import (
44
  FALCON_7B,
45
  )
46
 
47
- """
48
- query is json string with below format
49
- {
50
- "query": string,
51
- "model": string,
52
- "uuid": string,
53
- "image_search": bool,
54
- }
55
- """
56
 
57
 
58
  @action()
59
  async def general_question(query):
60
  """step 0: convert string to json"""
 
 
61
  try:
62
  json_query = json.loads(query)
63
  except Exception as ex:
64
  raise BrainException(BrainException.JSON_PARSING_ISSUE_MSG)
65
- """step 0-->: parsing parms from the json query"""
66
  query = json_query["query"]
67
  model = json_query["model"]
68
  uuid = json_query["uuid"]
69
  image_search = json_query["image_search"]
70
 
71
  """step 1: handle with gpt-4"""
72
- file_path = os.path.dirname(os.path.abspath(__file__))
73
-
74
- with open(f"{file_path}/phone.json", "r") as infile:
75
- data = json.load(infile)
76
- embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
77
-
78
- query_result = embeddings.embed_query(query)
79
- doc_list = utils.maximal_marginal_relevance(np.array(query_result), data, k=1)
80
- loader = CSVLoader(file_path=f"{file_path}/phone.csv", encoding="utf8")
81
- csv_text = loader.load()
82
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  docs = []
84
-
85
- for res in doc_list:
86
- docs.append(
87
- Document(
88
- page_content=csv_text[res].page_content, metadata=csv_text[res].metadata
89
- )
90
- )
91
 
92
  chain_data = get_llm_chain(model=model).run(input_documents=docs, question=query)
93
  # test
@@ -115,8 +106,8 @@ async def general_question(query):
115
  return str(result)
116
  except ValueError as e:
117
  # Check sms and browser query
118
- if doc_list[0] in COMMAND_SMS_INDEXS:
119
  return str({"program": "sms", "content": chain_data})
120
- elif doc_list[0] in COMMAND_BROWSER_OPEN:
121
  return str({"program": "browser", "content": "https://google.com"})
122
  return str({"program": "message", "content": falcon_llm.query(question=query)})
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
 
 
16
  import json
 
17
 
18
+ from Brain.src.service.train_service import TrainService
 
 
19
  from langchain.docstore.document import Document
20
 
21
  from Brain.src.common.brain_exception import BrainException
22
  from Brain.src.common.utils import (
23
+ COMMAND_SMS_INDEXES,
 
24
  COMMAND_BROWSER_OPEN,
25
+ PINECONE_INDEX_NAME,
26
  )
27
  from Brain.src.rising_plugin.image_embedding import (
28
  query_image_text,
29
  )
30
+ from Brain.src.rising_plugin.csv_embed import get_embed
31
 
32
  from nemoguardrails.actions import action
33
 
 
41
  FALCON_7B,
42
  )
43
 
44
+ from Brain.src.rising_plugin.pinecone_engine import (
45
+ get_pinecone_index_namespace,
46
+ init_pinecone,
47
+ )
 
 
 
 
 
48
 
49
 
50
  @action()
51
  async def general_question(query):
52
  """step 0: convert string to json"""
53
+ index = init_pinecone(PINECONE_INDEX_NAME)
54
+ train_service = TrainService()
55
  try:
56
  json_query = json.loads(query)
57
  except Exception as ex:
58
  raise BrainException(BrainException.JSON_PARSING_ISSUE_MSG)
59
+ """step 0-->: parsing params from the json query"""
60
  query = json_query["query"]
61
  model = json_query["model"]
62
  uuid = json_query["uuid"]
63
  image_search = json_query["image_search"]
64
 
65
  """step 1: handle with gpt-4"""
 
 
 
 
 
 
 
 
 
 
66
 
67
+ query_result = get_embed(query)
68
+ relatedness_data = index.query(
69
+ vector=query_result,
70
+ top_k=1,
71
+ include_values=False,
72
+ namespace=train_service.get_pinecone_index_train_namespace(),
73
+ )
74
+ documentId = ""
75
+ if len(relatedness_data["matches"]) > 0:
76
+ documentId = relatedness_data["matches"][0]["id"]
77
+ else:
78
+ return None
79
  docs = []
80
+ document = train_service.read_one_document(documentId)
81
+ docs.append(Document(page_content=document["page_content"], metadata=""))
 
 
 
 
 
82
 
83
  chain_data = get_llm_chain(model=model).run(input_documents=docs, question=query)
84
  # test
 
106
  return str(result)
107
  except ValueError as e:
108
  # Check sms and browser query
109
+ if documentId in COMMAND_SMS_INDEXES:
110
  return str({"program": "sms", "content": chain_data})
111
+ elif documentId in COMMAND_BROWSER_OPEN:
112
  return str({"program": "browser", "content": "https://google.com"})
113
  return str({"program": "message", "content": falcon_llm.query(question=query)})
Brain/src/router/api.py CHANGED
@@ -24,7 +24,6 @@ from Brain.src.rising_plugin.risingplugin import (
24
  handle_chat_completion,
25
  )
26
  from Brain.src.firebase.cloudmessage import send_message, get_tokens
27
- from Brain.src.rising_plugin.csv_embed import csv_embed
28
  from Brain.src.rising_plugin.image_embedding import embed_image_text, query_image_text
29
 
30
  from Brain.src.logs import logger
@@ -162,16 +161,6 @@ def construct_blueprint_api() -> APIRouter:
162
  result={"program": "image", "content": image_response},
163
  )
164
 
165
- """@generator.response(
166
- status_code=200, schema={"message": "message", "result": "test_result"}
167
- )"""
168
-
169
- @router.get("/training")
170
- def csv_training():
171
- csv_embed()
172
-
173
- return assembler.to_response(200, "trained successfully", "")
174
-
175
  """@generator.request_body(
176
  {
177
  "token": "test_token",
 
24
  handle_chat_completion,
25
  )
26
  from Brain.src.firebase.cloudmessage import send_message, get_tokens
 
27
  from Brain.src.rising_plugin.image_embedding import embed_image_text, query_image_text
28
 
29
  from Brain.src.logs import logger
 
161
  result={"program": "image", "content": image_response},
162
  )
163
 
 
 
 
 
 
 
 
 
 
 
164
  """@generator.request_body(
165
  {
166
  "token": "test_token",
Brain/src/router/train_router.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from Brain.src.common.assembler import Assembler
4
+ from Brain.src.model.requests.request_model import (
5
+ Document,
6
+ )
7
+ from Brain.src.service.train_service import TrainService
8
+
9
+ router = APIRouter()
10
+
11
+
12
+ def construct_blueprint_train_api() -> APIRouter:
13
+ # Assembler
14
+ assembler = Assembler()
15
+
16
+ # Services
17
+ train_service = TrainService()
18
+
19
+ """@generator.response(
20
+ status_code=200, schema={"message": "message", "result": "test_result"}
21
+ )"""
22
+
23
+ @router.get("")
24
+ def read_all_documents():
25
+ try:
26
+ result = train_service.read_all_documents()
27
+ except Exception as e:
28
+ return assembler.to_response(400, "failed to get all documents", "")
29
+ return assembler.to_response(200, "Get all documents list successfully", result)
30
+
31
+ """@generator.response( status_code=200, schema={"message": "message", "result": {"document_id": "document_id",
32
+ "page_content":"page_content"}} )"""
33
+
34
+ @router.get("/all")
35
+ def train_all_documents():
36
+ try:
37
+ result = train_service.train_all_documents()
38
+ except Exception as e:
39
+ return assembler.to_response(400, "failed to get all documents", "")
40
+ return assembler.to_response(200, "Get all documents list successfully", result)
41
+
42
+ """@generator.response( status_code=200, schema={"message": "message", "result": {"document_id": "document_id",
43
+ "page_content":"page_content"}} )"""
44
+
45
+ @router.get("/{document_id}")
46
+ def read_one_document(document_id: str):
47
+ if document_id != "all":
48
+ try:
49
+ result = train_service.read_one_document(document_id)
50
+ except Exception as e:
51
+ return assembler.to_response(400, "fail to get one document", "")
52
+ return assembler.to_response(200, "Get one document successfully", result)
53
+
54
+ """@generator.request_body(
55
+ {
56
+ "token": "test_token",
57
+ "uuid": "test_uuid",
58
+ "page_content": "string",
59
+ }
60
+ )
61
+ @generator.response( status_code=200, schema={"message": "message", "result": {"document_id": "document_id",
62
+ "page_content":"page_content"}} )"""
63
+
64
+ @router.post("")
65
+ def create_document_train(data: Document):
66
+ try:
67
+ result = train_service.create_one_document(data.page_content)
68
+ except Exception as e:
69
+ return assembler.to_response(400, "failed to create one document", "")
70
+ return assembler.to_response(
71
+ 200, "created one document and trained it successfully", result
72
+ )
73
+
74
+ """@generator.request_body(
75
+ {
76
+ "token": "test_token",
77
+ "uuid": "test_uuid",
78
+ "document_id": "string",
79
+ "page_content": "string",
80
+ }
81
+ )
82
+ @generator.response( status_code=200, schema={"message": "message", "result": {"document_id": "document_id",
83
+ "page_content":"page_content"}} )"""
84
+
85
+ @router.put("")
86
+ def update_one_document(data: Document):
87
+ try:
88
+ result = train_service.update_one_document(
89
+ data.document_id, data.page_content
90
+ )
91
+ except Exception as e:
92
+ return assembler.to_response(400, "fail to update one document", "")
93
+ return assembler.to_response(
94
+ 200, "updated one document and trained it successfully", result
95
+ )
96
+
97
+ """@generator.request_body(
98
+ {
99
+ "token": "test_token",
100
+ "uuid": "test_uuid",
101
+ "document_id": "string",
102
+ }
103
+ )
104
+ @generator.response( status_code=200, schema={"message": "message", "result": {"document_id": "document_id"}} )"""
105
+
106
+ @router.delete("/{document_id}")
107
+ def delete_one_document(document_id: str):
108
+ try:
109
+ result = train_service.delete_one_document(document_id)
110
+ except Exception as e:
111
+ return assembler.to_response(400, "fail to delete one train", "")
112
+ return assembler.to_response(
113
+ 200, "deleted one document and train data successfully", result
114
+ )
115
+
116
+ return router
Brain/src/service/train_service.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """service to manage trains"""
2
+ from typing import List, Any
3
+
4
+ from Brain.src.rising_plugin.csv_embed import get_embed
5
+ from Brain.src.rising_plugin.pinecone_engine import (
6
+ get_pinecone_index_namespace,
7
+ update_pinecone,
8
+ init_pinecone,
9
+ delete_pinecone,
10
+ add_pinecone,
11
+ delete_all_pinecone,
12
+ )
13
+
14
+ from firebase_admin import firestore
15
+ import datetime
16
+
17
+
18
+ def to_json(page_content: str):
19
+ return {
20
+ "page_content": page_content,
21
+ "timestamp": datetime.datetime.now().timestamp(),
22
+ }
23
+
24
+
25
+ class TrainService:
26
+ """train (getting embedding) and update pinecone with embeddings by document_id
27
+ train datatype:
28
+ key: document_id
29
+ values: {page_content}"""
30
+
31
+ def __init__(self):
32
+ self.db = firestore.client()
33
+ self.documents_ref = self.db.collection("documents")
34
+
35
+ """read all documents from firestore"""
36
+
37
+ def read_all_documents(self):
38
+ query = self.documents_ref.order_by("timestamp")
39
+ docs = query.stream()
40
+ result = []
41
+ for item in docs:
42
+ item_data = item.to_dict()
43
+ result.append(
44
+ {"document_id": item.id, "page_content": item_data["page_content"]}
45
+ )
46
+ return result
47
+
48
+ """read one document from firestore"""
49
+
50
+ def read_one_document(self, document_id: str):
51
+ doc = self.documents_ref.document(document_id).get()
52
+ if doc.exists:
53
+ return {
54
+ "document_id": document_id,
55
+ "page_content": doc.to_dict()["page_content"],
56
+ }
57
+ else:
58
+ return None
59
+
60
+ """create a new document and train it"""
61
+
62
+ def create_one_document(self, page_content: str):
63
+ # Auto-generate document ID
64
+ auto_generated_doc_ref = self.documents_ref.document()
65
+ auto_generated_doc_ref.set(to_json(page_content))
66
+ auto_generated_document_id = auto_generated_doc_ref.id
67
+ self.train_one_document(auto_generated_document_id, page_content)
68
+ return {"document_id": auto_generated_document_id, "page_content": page_content}
69
+
70
+ """update a document by using id and train it"""
71
+
72
+ def update_one_document(self, document_id: str, page_content: str):
73
+ self.documents_ref.document(document_id).update(to_json(page_content))
74
+ self.train_one_document(document_id, page_content)
75
+ return {"document_id": document_id, "page_content": page_content}
76
+
77
+ """delete a document by using document_id"""
78
+
79
+ def delete_one_document(self, document_id: str):
80
+ self.documents_ref.document(document_id).delete()
81
+ self.delete_one_pinecone(document_id)
82
+ return {"document_id": document_id}
83
+
84
+ def train_all_documents(self) -> str:
85
+ self.delete_all()
86
+ documents = self.read_all_documents()
87
+ result = list()
88
+ pinecone_namespace = self.get_pinecone_index_namespace()
89
+ for item in documents:
90
+ query_result = get_embed(item["page_content"])
91
+ result.append(query_result)
92
+ key = item["document_id"]
93
+ value = f'{item["page_content"]}'
94
+ # get vectoring data(embedding data)
95
+ vectoring_values = get_embed(value)
96
+ add_pinecone(namespace=pinecone_namespace, key=key, value=vectoring_values)
97
+
98
+ return "trained all documents successfully"
99
+
100
+ def train_one_document(self, document_id: str, page_content: str) -> None:
101
+ pinecone_namespace = self.get_pinecone_index_namespace()
102
+ result = list()
103
+ query_result = get_embed(page_content)
104
+ result.append(query_result)
105
+ key = document_id
106
+ value = f"{page_content}, {query_result}"
107
+ # get vectoring data(embedding data)
108
+ vectoring_values = get_embed(value)
109
+ add_pinecone(namespace=pinecone_namespace, key=key, value=vectoring_values)
110
+
111
+ def delete_all(self) -> Any:
112
+ return delete_all_pinecone(self.get_pinecone_index_namespace())
113
+
114
+ def delete_one_pinecone(self, document_id: str) -> Any:
115
+ return delete_pinecone(self.get_pinecone_index_namespace(), document_id)
116
+
117
+ def get_pinecone_index_namespace(self) -> str:
118
+ return get_pinecone_index_namespace(f"trains")
119
+
120
+ def get_pinecone_index_train_namespace(self) -> str:
121
+ return get_pinecone_index_namespace(f"trains")
app.py CHANGED
@@ -3,6 +3,7 @@ from fastapi import Depends, FastAPI
3
  import uvicorn
4
 
5
  from Brain.src.router.browser_router import construct_blueprint_browser_api
 
6
 
7
  initialize_app()
8
 
@@ -14,6 +15,10 @@ app.include_router(
14
  construct_blueprint_browser_api(), prefix="/browser", tags=["ai_browser"]
15
  )
16
 
 
 
 
 
17
 
18
  if __name__ == "__main__":
19
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
3
  import uvicorn
4
 
5
  from Brain.src.router.browser_router import construct_blueprint_browser_api
6
+ from Brain.src.router.train_router import construct_blueprint_train_api
7
 
8
  initialize_app()
9
 
 
15
  construct_blueprint_browser_api(), prefix="/browser", tags=["ai_browser"]
16
  )
17
 
18
+ app.include_router(
19
+ construct_blueprint_train_api(), prefix="/train", tags=["ai_train"]
20
+ )
21
+
22
 
23
  if __name__ == "__main__":
24
  uvicorn.run(app, host="0.0.0.0", port=7860)