Hariprasad Navilur N committed on
Commit
d372dcc
·
1 Parent(s): e0f2ca2

global vars fixed

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -0
  2. app.py +19 -28
Dockerfile CHANGED
@@ -7,4 +7,6 @@ RUN pip install --no-cache-dir --upgrade pip && pip install -r requirements.txt
7
 
8
  COPY . .
9
 
 
 
10
  CMD ["gunicorn", "app:app", "-b", "0.0.0.0:7860"]
 
7
 
8
  COPY . .
9
 
10
+ EXPOSE 7860:7860
11
+
12
  CMD ["gunicorn", "app:app", "-b", "0.0.0.0:7860"]
app.py CHANGED
@@ -12,9 +12,7 @@ from flask_cors import CORS
12
  from groq import Groq
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
  from langchain_community.document_loaders import PyPDFLoader
15
- from langchain_community.embeddings.sentence_transformer import (
16
- SentenceTransformerEmbeddings
17
- )
18
  from langchain_community.vectorstores import Chroma
19
 
20
  logging.basicConfig(
@@ -26,6 +24,10 @@ logging.basicConfig(
26
  )
27
  logger = logging.getLogger(__name__)
28
 
 
 
 
 
29
  app = Flask(__name__)
30
  CORS(app)
31
 
@@ -77,9 +79,7 @@ def ingest_documents(pdf_folder_location, tenant_id=None, policy_set_id=None, do
77
 
78
  len(compliance_chunks)
79
 
80
- compliance_collection = 'compliance_collection'
81
-
82
- embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
83
 
84
  vectorstore = Chroma(
85
  collection_name=compliance_collection,
@@ -97,29 +97,15 @@ def ingest_documents(pdf_folder_location, tenant_id=None, policy_set_id=None, do
97
 
98
  def callLlm(data):
99
  import os
 
100
  # Loading the Chroma DB and using the retriever to retreive the chunks just for testing
101
 
102
- compliance_collection = 'compliance_collection'
103
- embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
104
-
105
- vectorstore_persisted = Chroma(
106
- collection_name=compliance_collection,
107
- persist_directory='./compliance_db',
108
- embedding_function=embedding_model
109
- )
110
-
111
  transcript = data["transcript"]
112
  combined_text = " ".join(turn["content"] for turn in transcript if "content" in turn)
113
 
114
  client = Groq()
115
  model_name = 'openai/gpt-oss-20b'
116
 
117
- #
118
- # retriever = vectorstore_persisted.as_retriever(
119
- # search_type='similarity',
120
- # search_kwargs={'k': 5}
121
- # )
122
-
123
  # # Original cell: 26E1QcvAR-OO
124
  # # Retrieve the first two chunks from the vector store
125
  # retrieved_data = vectorstore_persisted.get(
@@ -157,15 +143,19 @@ def callLlm(data):
157
  {transcript}
158
  """
159
 
160
- # # Original cell: MUBRJsi12e59
161
- # relevant_document_chunks = retriever.get_relevant_documents(combined_text)
162
-
 
163
  tenant_id = data["tenant_id"]
164
- relevant_document_chunks = vectorstore_persisted.similarity_search(combined_text, k=3,
165
- filter={"tenant_id": tenant_id})
 
 
166
 
167
  len(relevant_document_chunks)
168
 
 
169
  for document in relevant_document_chunks:
170
  logger.info(document.page_content.replace("\t", " "))
171
  break
@@ -182,7 +172,7 @@ def callLlm(data):
182
  }
183
  ]
184
 
185
- logger.info(prompt)
186
 
187
  try:
188
  response = client.chat.completions.create(
@@ -200,10 +190,11 @@ def callLlm(data):
200
 
201
  if __name__ == '__main__':
202
  import os
 
203
 
204
  # todo: list all policy documents and ingest them once
205
  pdf_folder_location = "Bank_Contact_Center_Compliance_Policies.pdf"
206
- ingest_documents(
207
  pdf_folder_location=pdf_folder_location,
208
  tenant_id="tenant_123",
209
  policy_set_id="policy_set_abc",
 
12
  from groq import Groq
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
  from langchain_community.document_loaders import PyPDFLoader
15
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
 
 
16
  from langchain_community.vectorstores import Chroma
17
 
18
  logging.basicConfig(
 
24
  )
25
  logger = logging.getLogger(__name__)
26
 
27
+ compliance_collection = 'compliance_collection'
28
+ embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
29
+ vectorstore = None
30
+
31
  app = Flask(__name__)
32
  CORS(app)
33
 
 
79
 
80
  len(compliance_chunks)
81
 
82
+ os.environ["CHROMA_TELEMETRY"] = "FALSE"
 
 
83
 
84
  vectorstore = Chroma(
85
  collection_name=compliance_collection,
 
97
 
98
  def callLlm(data):
99
  import os
100
+ global compliance_collection, embedding_model, vectorstore
101
  # Loading the Chroma DB and using the retriever to retreive the chunks just for testing
102
 
 
 
 
 
 
 
 
 
 
103
  transcript = data["transcript"]
104
  combined_text = " ".join(turn["content"] for turn in transcript if "content" in turn)
105
 
106
  client = Groq()
107
  model_name = 'openai/gpt-oss-20b'
108
 
 
 
 
 
 
 
109
  # # Original cell: 26E1QcvAR-OO
110
  # # Retrieve the first two chunks from the vector store
111
  # retrieved_data = vectorstore_persisted.get(
 
143
  {transcript}
144
  """
145
 
146
+ retriever = vectorstore.as_retriever(
147
+ search_type='similarity',
148
+ search_kwargs={'k': 5}
149
+ )
150
  tenant_id = data["tenant_id"]
151
+ relevant_document_chunks = retriever.get_relevant_documents(combined_text, metadata = {"tenant_id": tenant_id})
152
+
153
+ # relevant_document_chunks = vectorstore_persisted.similarity_search(combined_text, k=3,
154
+ # filter={"tenant_id": tenant_id})
155
 
156
  len(relevant_document_chunks)
157
 
158
+ logger.info("relevent chunks: ")
159
  for document in relevant_document_chunks:
160
  logger.info(document.page_content.replace("\t", " "))
161
  break
 
172
  }
173
  ]
174
 
175
+ logger.info("prompt: " + prompt)
176
 
177
  try:
178
  response = client.chat.completions.create(
 
190
 
191
  if __name__ == '__main__':
192
  import os
193
+ global vectorstore
194
 
195
  # todo: list all policy documents and ingest them once
196
  pdf_folder_location = "Bank_Contact_Center_Compliance_Policies.pdf"
197
+ vectorstore = ingest_documents(
198
  pdf_folder_location=pdf_folder_location,
199
  tenant_id="tenant_123",
200
  policy_set_id="policy_set_abc",