ram36 committed on
Commit
cd38b19
·
verified ·
1 Parent(s): 2d46866

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -48
app.py CHANGED
@@ -1,22 +1,14 @@
1
- #pip install gradio
2
-
3
-
4
- #pip install datasets tqdm pandas matplotlib langchain sentence_transformers faiss-gpu langchain-community torch accelerate
5
-
6
  import gradio as gr
7
  import pandas as pd
8
- from tqdm.notebook import tqdm
9
- from datasets import Dataset
10
- import matplotlib.pyplot as plt
11
  from langchain.docstore.document import Document as LangchainDocument
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
- from sentence_transformers import SentenceTransformer
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
15
- from langchain.vectorstores import FAISS
16
  from langchain_community.embeddings import HuggingFaceEmbeddings
17
  from langchain_community.vectorstores.utils import DistanceStrategy
18
  import torch
19
-
20
 
21
  # Set display option for pandas
22
  pd.set_option("display.max_colwidth", None)
@@ -29,12 +21,11 @@ with open("iplteams_info.txt", "r") as fp1:
29
  with open("match_summaries_sentences.txt", "r") as fp2:
30
  content2 = fp2.read()
31
 
32
- # Open and read the second file
33
  with open("formatted_playersinfo.txt", "r") as fp3:
34
  content3 = fp3.read()
35
 
36
-
37
- # Combine contents of both files, separated by three newlines
38
  combined_content = content1 + "\n\n\n" + content2 + "\n\n\n" + content3
39
 
40
  # Split the combined content into sections
@@ -50,11 +41,7 @@ RAW_KNOWLEDGE_BASE = [
50
  for doc in tqdm(s)
51
  ]
52
 
53
-
54
- from langchain.text_splitter import RecursiveCharacterTextSplitter
55
- from sentence_transformers import SentenceTransformer
56
- from transformers import AutoTokenizer
57
-
58
  MARKDOWN_SEPARATORS = [
59
  "\n#{1,6}",
60
  "```\n",
@@ -87,11 +74,6 @@ fig.set_title("Histogram of Document Lengths")
87
  plt.title("Distribution")
88
  plt.show()
89
 
90
-
91
- from typing import Optional, List
92
- from langchain.text_splitter import RecursiveCharacterTextSplitter
93
- from transformers import AutoTokenizer
94
-
95
  EMBEDDING_MODEL_NAME = "thenlper/gte-small"
96
 
97
  def split_documents(
@@ -123,12 +105,6 @@ docs_processed = split_documents(512, RAW_KNOWLEDGE_BASE, tokenizer_name=EMBEDDI
123
  print(len(docs_processed))
124
  print(docs_processed[0:3])
125
 
126
-
127
- from langchain.vectorstores import FAISS
128
- from langchain_community.embeddings import HuggingFaceEmbeddings
129
- from langchain_community.vectorstores.utils import DistanceStrategy
130
- import torch
131
-
132
  print(torch.cuda.is_available())
133
 
134
  embedding_model = HuggingFaceEmbeddings(
@@ -144,10 +120,6 @@ KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
144
  distance_strategy=DistanceStrategy.COSINE,
145
  )
146
 
147
-
148
- import torch
149
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
150
-
151
  torch.random.manual_seed(0)
152
 
153
  model = AutoModelForCausalLM.from_pretrained(
@@ -171,7 +143,6 @@ generation_args = {
171
  "do_sample": False,
172
  }
173
 
174
-
175
  prompt_chat=[
176
  {
177
  "role":"system",
@@ -180,7 +151,6 @@ Give a comprehensive answer to the question.
180
  Respond only to the question asked , response should be concise and relevant to the question.
181
  provide the number of the source document when relevant.
182
  If the answer cannot be deduced from the context, do not give an answer""",
183
-
184
  },
185
  {
186
  "role":"user",
@@ -192,26 +162,24 @@ Question:{question}
192
  """,
193
  },
194
  ]
195
- RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
196
- prompt_chat,tokenize = False,add_generation_prompt=True,
197
 
 
 
198
  )
199
  print(RAG_PROMPT_TEMPLATE)
200
 
201
  u_query = "give the match summary of royal challengers bengaluru and mumbai indians in 2024"
202
- # ret_text = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query,k=3)
203
- retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query,k=3)
204
 
205
  context = retrieved_docs[0].page_content
206
  final_prompt = RAG_PROMPT_TEMPLATE.format(
207
- question= u_query, context = context
208
  )
209
 
210
  output = pipe(final_prompt, **generation_args)
211
- print("YOUR QUESTION:\n",u_query,"\n")
212
- print("MICROSOFT 128K ANSWER: \n",output[0]['generated_text'])
213
 
214
- # Define the function to handle queries
215
  def handle_query(question):
216
  retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=3)
217
  context = retrieved_docs[0].page_content
@@ -221,8 +189,6 @@ def handle_query(question):
221
  output = pipe(final_prompt, **generation_args)
222
  return output[0]['generated_text']
223
 
224
-
225
- # Create a Gradio interface
226
  interface = gr.Interface(
227
  fn=handle_query,
228
  inputs="text",
@@ -231,5 +197,4 @@ interface = gr.Interface(
231
  description="Get the match summary of IPL teams based on your query.",
232
  )
233
 
234
- interface.launch(share=True)
235
-
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from tqdm import tqdm
 
 
4
  from langchain.docstore.document import Document as LangchainDocument
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
6
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores.utils import DistanceStrategy
10
  import torch
11
+ import matplotlib.pyplot as plt
12
 
13
  # Set display option for pandas
14
  pd.set_option("display.max_colwidth", None)
 
21
  with open("match_summaries_sentences.txt", "r") as fp2:
22
  content2 = fp2.read()
23
 
24
+ # Open and read the third file
25
  with open("formatted_playersinfo.txt", "r") as fp3:
26
  content3 = fp3.read()
27
 
28
+ # Combine contents of all files, separated by three newlines
 
29
  combined_content = content1 + "\n\n\n" + content2 + "\n\n\n" + content3
30
 
31
  # Split the combined content into sections
 
41
  for doc in tqdm(s)
42
  ]
43
 
44
+ # Define markdown separators
 
 
 
 
45
  MARKDOWN_SEPARATORS = [
46
  "\n#{1,6}",
47
  "```\n",
 
74
  plt.title("Distribution")
75
  plt.show()
76
 
 
 
 
 
 
77
  EMBEDDING_MODEL_NAME = "thenlper/gte-small"
78
 
79
  def split_documents(
 
105
  print(len(docs_processed))
106
  print(docs_processed[0:3])
107
 
 
 
 
 
 
 
108
  print(torch.cuda.is_available())
109
 
110
  embedding_model = HuggingFaceEmbeddings(
 
120
  distance_strategy=DistanceStrategy.COSINE,
121
  )
122
 
 
 
 
 
123
  torch.random.manual_seed(0)
124
 
125
  model = AutoModelForCausalLM.from_pretrained(
 
143
  "do_sample": False,
144
  }
145
 
 
146
  prompt_chat=[
147
  {
148
  "role":"system",
 
151
  Respond only to the question asked , response should be concise and relevant to the question.
152
  provide the number of the source document when relevant.
153
  If the answer cannot be deduced from the context, do not give an answer""",
 
154
  },
155
  {
156
  "role":"user",
 
162
  """,
163
  },
164
  ]
 
 
165
 
166
+ RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
167
+ prompt_chat, tokenize=False, add_generation_prompt=True,
168
  )
169
  print(RAG_PROMPT_TEMPLATE)
170
 
171
  u_query = "give the match summary of royal challengers bengaluru and mumbai indians in 2024"
172
+ retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query, k=3)
 
173
 
174
  context = retrieved_docs[0].page_content
175
  final_prompt = RAG_PROMPT_TEMPLATE.format(
176
+ question=u_query, context=context
177
  )
178
 
179
  output = pipe(final_prompt, **generation_args)
180
+ print("YOUR QUESTION:\n", u_query, "\n")
181
+ print("MICROSOFT 128K ANSWER: \n", output[0]['generated_text'])
182
 
 
183
  def handle_query(question):
184
  retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=3)
185
  context = retrieved_docs[0].page_content
 
189
  output = pipe(final_prompt, **generation_args)
190
  return output[0]['generated_text']
191
 
 
 
192
  interface = gr.Interface(
193
  fn=handle_query,
194
  inputs="text",
 
197
  description="Get the match summary of IPL teams based on your query.",
198
  )
199
 
200
+ interface.launch(share=True)