# RAG question answering over local IPL text files:
# FAISS retrieval (gte-small embeddings) + Phi-3 generation, served via Gradio.
import gradio as gr
import pandas as pd
from tqdm import tqdm
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import torch
import matplotlib.pyplot as plt
from typing import Optional, List
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
# Show full column contents when printing pandas objects (no truncation).
pd.set_option("display.max_colwidth", None)

# Source files that make up the knowledge base.
KNOWLEDGE_FILES = [
    "iplteams_info.txt",
    "match_summaries_sentences.txt",
    "formatted_playersinfo.txt",
]

# Read every source file. Encoding is pinned to UTF-8 so the script does not
# depend on the platform default (e.g. cp1252 on Windows).
_file_contents = []
for _path in KNOWLEDGE_FILES:
    with open(_path, "r", encoding="utf-8") as fp:
        _file_contents.append(fp.read())

# Combine all files separated by three newlines, then split on the same
# delimiter. NOTE: this also splits on any "\n\n\n" already present inside a
# file, so each section becomes its own document.
combined_content = "\n\n\n".join(_file_contents)
s = combined_content.split("\n\n\n")

# Sanity check: first section and total section count.
print(s[0])
print(len(s))

# Wrap each raw text section in a LangchainDocument for downstream splitting.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc)
    for doc in tqdm(s)
]
# Separator candidates tried in order by the recursive splitter, from
# coarsest (markdown headings, code fences, rules) down to single characters.
MARKDOWN_SEPARATORS = [
    "\n#{1,6}",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n__+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

# Embedding model used both for chunk-length measurement and for the vector
# store below. Defined once here instead of repeating the literal.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

# First-pass character-based splitter (exploratory; the token-aware
# split_documents() below produces the chunks actually indexed).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

docs_processed = []
for doc in RAW_KNOWLEDGE_BASE:
    docs_processed += text_splitter.split_documents([doc])

# Measure each chunk in tokens of the embedding model's tokenizer to check
# chunks fit the model's context window (gte-small caps at 512 tokens).
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]

# pd.Series.hist returns a matplotlib Axes; the original code set a title on
# the Axes and then immediately overwrote it with plt.title — set it once.
ax = pd.Series(lengths).hist()
ax.set_title("Distribution of chunk lengths (tokens)")
plt.show()
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """Split documents into token-bounded chunks and drop exact duplicates.

    Args:
        chunk_size: Maximum chunk length, in tokens of ``tokenizer_name``.
        knowledge_base: Documents to split.
        tokenizer_name: HuggingFace tokenizer repo id used to count tokens.

    Returns:
        De-duplicated chunks (first occurrence kept, order preserved), each
        carrying a ``start_index`` in its metadata.
    """
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        # 10% overlap keeps sentences that straddle a chunk boundary
        # retrievable from at least one chunk.
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    chunks: List[LangchainDocument] = []
    for doc in knowledge_base:
        chunks += splitter.split_documents([doc])

    # Drop chunks whose text is byte-identical to an earlier chunk; a set
    # (rather than a dict with dummy values) expresses membership-only intent.
    seen: set = set()
    unique_chunks: List[LangchainDocument] = []
    for chunk in chunks:
        if chunk.page_content not in seen:
            seen.add(chunk.page_content)
            unique_chunks.append(chunk)
    return unique_chunks
# Re-chunk with the token-aware splitter (512 = gte-small context limit).
docs_processed = split_documents(512, RAW_KNOWLEDGE_BASE, tokenizer_name=EMBEDDING_MODEL_NAME)
print(len(docs_processed))
print(docs_processed[0:3])

# Fall back to CPU when no GPU is present; the original hard-coded "cuda"
# right after *printing* this very check, and crashed on CPU-only machines.
print(torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": device},
    # Normalized embeddings are required for cosine distance to be meaningful.
    encode_kwargs={"normalize_embeddings": True},
)

# Build the FAISS index over all chunks using cosine similarity.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)
torch.random.manual_seed(0)  # reproducible generation

# Reader LLM. Name defined once; device falls back to CPU without a GPU
# (the original hard-coded device_map="cuda").
READER_MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,  # return only the newly generated text
    # Greedy decoding. "temperature" is ignored when do_sample=False and
    # only triggers a transformers warning, so it is omitted here.
    "do_sample": False,
}
# Chat-format RAG prompt: a system instruction plus a user turn containing
# {context} and {question} placeholders that are filled via str.format later.
prompt_chat=[
    {
        "role":"system",
        "content":"""Using the information contained in the context,
Give a comprehensive answer to the question.
Respond only to the question asked , response should be concise and relevant to the question.
provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer""",
    },
    {
        "role":"user",
        "content":"""Context:
{context}
---
Now here is the Question you need to answer.
Question:{question}
""",
    },
]
# Render the chat messages into Phi-3's plain-text chat format
# (tokenize=False returns a string; add_generation_prompt appends the
# assistant-turn marker so generation starts at the answer).
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_chat, tokenize=False, add_generation_prompt=True,
)
print(RAG_PROMPT_TEMPLATE)
# Demo query: retrieve the top-3 most similar chunks and answer from them.
u_query = "give the match summary of royal challengers bengaluru and mumbai indians in 2024"
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query, k=3)

# Use ALL k retrieved chunks, numbered so the model can cite a source
# document as the system prompt asks; the original fed only the first chunk
# and discarded the other two retrieved results.
context = "\n".join(
    f"Document {i}:::\n{doc.page_content}"
    for i, doc in enumerate(retrieved_docs)
)
final_prompt = RAG_PROMPT_TEMPLATE.format(
    question=u_query, context=context
)
output = pipe(final_prompt, **generation_args)
print("YOUR QUESTION:\n", u_query, "\n")
print("MICROSOFT 128K ANSWER: \n", output[0]['generated_text'])
def handle_query(question: str) -> str:
    """Answer *question* using the RAG pipeline.

    Retrieves the 3 most similar chunks from the FAISS index, formats them
    into the chat prompt template, and returns Phi-3's generated answer.

    Args:
        question: Free-text user question from the Gradio text box.

    Returns:
        The generated answer text (new tokens only).
    """
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=question, k=3)
    # Include every retrieved chunk, numbered so the answer can reference a
    # source document (the original used only retrieved_docs[0]).
    context = "\n".join(
        f"Document {i}:::\n{doc.page_content}"
        for i, doc in enumerate(retrieved_docs)
    )
    final_prompt = RAG_PROMPT_TEMPLATE.format(
        question=question, context=context
    )
    output = pipe(final_prompt, **generation_args)
    return output[0]['generated_text']
# Gradio UI: one text box in, the generated answer out.
interface = gr.Interface(
    fn=handle_query,
    inputs="text",
    outputs="text",
    title="IPL Match Summary Generator",
    description="Get the match summary of IPL teams based on your query.",
)

# The keyword is `share` (creates a public gradio.live link); the original
# `sharing=True` is not a valid launch() parameter and raises TypeError.
interface.launch(share=True)