Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 3 |
+
from langchain.docstore.document import Document as LangchainDocument
|
| 4 |
+
from langchain.vectorstores import FAISS
|
| 5 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 6 |
+
from langchain_community.vectorstores.utils import DistanceStrategy
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
|
| 12 |
+
# Model identifiers resolved from the Hugging Face Hub.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"  # sentence-embedding model used for retrieval
MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"  # generation (reader) LLM

# Set display option for pandas: never truncate column contents when printing.
pd.set_option("display.max_colwidth", None)
|
| 17 |
+
|
| 18 |
+
# Load the three knowledge-base text files and merge them into one corpus.
# An explicit encoding avoids platform-dependent decoding errors (the
# original relied on the locale default).
_SOURCE_FILES = [
    "iplteams_info.txt",
    "match_summaries_sentences.txt",
    "formatted_playersinfo.txt",
]
_contents = []
for _path in _SOURCE_FILES:
    with open(_path, "r", encoding="utf-8") as fp:
        _contents.append(fp.read())

# Combine the file contents, separated by three newlines so file
# boundaries are treated the same way as in-file section breaks below.
combined_content = "\n\n\n".join(_contents)

# Split the combined content into individual sections (one per document).
s = combined_content.split("\n\n\n")
|
| 33 |
+
|
| 34 |
+
# Wrap every raw text section in a LangchainDocument so the downstream
# LangChain utilities (splitter, vector store) can consume them.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=section) for section in tqdm(s)
]
|
| 39 |
+
|
| 40 |
+
# Chunk the documents for embedding. Separators are tried in order:
# markdown headers, code fences, horizontal rules, then progressively
# smaller text units down to single characters.
MARKDOWN_SEPARATORS = ["\n#{1,6}", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n__+\n", "\n\n", "\n", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # max characters per chunk
    chunk_overlap=100,     # overlap preserves context across chunk edges
    add_start_index=True,  # record each chunk's offset within its source doc
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)
# split_documents already accepts a list of documents, so a single call
# replaces the original per-document loop-and-concatenate.
docs_processed = text_splitter.split_documents(RAW_KNOWLEDGE_BASE)
|
| 52 |
+
|
| 53 |
+
# Visualize the distribution of chunk lengths in embedding-model tokens,
# as a sanity check that chunks fit the embedding model's context window.
# Bound to a dedicated name so it is not confused with the generation
# tokenizer loaded later (the original rebound `tokenizer` twice).
embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(embedding_tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
# pandas' .hist() returns a matplotlib Axes, not a Figure — name it so.
ax = pd.Series(lengths).hist()
# Set the title exactly once; the original set it via the Axes and then
# again via pyplot, with the second call silently overwriting the first.
ax.set_title("Distribution of Document Lengths")
plt.show()
|
| 60 |
+
|
| 61 |
+
# Drop chunks with identical page_content, keeping the first occurrence
# of each (order preserved).
seen_contents = set()
deduped = []
for chunk in docs_processed:
    if chunk.page_content in seen_contents:
        continue
    seen_contents.add(chunk.page_content)
    deduped.append(chunk)
docs_processed = deduped
|
| 69 |
+
|
| 70 |
+
# Load the embedding model used to vectorize the knowledge-base chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,  # spawn worker processes for batch encoding
    model_kwargs={"device": "cuda"},  # NOTE(review): assumes a CUDA GPU is available — confirm
    encode_kwargs={"normalize_embeddings": True},  # unit-norm vectors, so cosine ≈ dot product
)

# Build an in-memory FAISS index over all chunks; this embeds every chunk up front.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,  # rank neighbors by cosine similarity
)
|
| 84 |
+
|
| 85 |
+
# Load the generation (reader) LLM and its tokenizer.
# NOTE(review): no device placement or dtype is specified, so the model
# loads on CPU in full precision by default — confirm this is intended.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Rebinds the module-level `tokenizer`; from here on it refers to the
# Phi-3 chat tokenizer used for prompting and generation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 88 |
+
|
| 89 |
+
# Chat-format prompt for RAG. The literal "{context}" and "{question}"
# placeholders pass through apply_chat_template unchanged and are filled
# in later via str.format in query_knowledge_base.
prompt_chat = [
    {
        "role": "system",
        "content": """Using the information contained in the context, Give a comprehensive answer to the question. Respond only to the question asked, response should be concise and relevant to the question. Provide the number of the source document when relevant. If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context: {context} --- Now here is the Question you need to answer. Question: {question}""",
    },
]
# Render the chat messages into the model's expected prompt string;
# add_generation_prompt=True appends the assistant-turn header so the
# model continues with the answer rather than a new user turn.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(prompt_chat, tokenize=False, add_generation_prompt=True)
|
| 101 |
+
|
| 102 |
+
# Deterministic decoding settings for the reader LLM.
generation_args = {
    "max_new_tokens": 500,      # cap on generated answer length
    "return_full_text": False,  # return only the completion, not the prompt
    # Greedy decoding: do_sample=False already makes generation
    # deterministic. The original additionally passed temperature=0.0,
    # which is ignored under greedy decoding and triggers a transformers
    # warning (a ValueError in some versions), so it is dropped here.
    "do_sample": False,
}
|
| 109 |
+
|
| 110 |
+
def query_knowledge_base(u_query):
    """Answer *u_query* with retrieval-augmented generation.

    Retrieves the top-k chunks from the FAISS index, formats them into the
    RAG prompt, and generates an answer with the reader LLM.

    Args:
        u_query: Natural-language question to answer.

    Returns:
        The generated answer text only (the prompt is excluded because
        generation_args sets return_full_text=False).
    """
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=u_query, k=3)
    # Use ALL retrieved documents, numbered so the model can cite sources
    # as the system prompt asks. The original requested k=3 but silently
    # discarded everything except the first hit.
    context = "\n".join(
        f"Document {i}:::\n{doc.page_content}"
        for i, doc in enumerate(retrieved_docs)
    )
    final_prompt = RAG_PROMPT_TEMPLATE.format(question=u_query, context=context)
    # Build the generation pipeline once and cache it on the function —
    # the original reconstructed it on every call.
    pipe = getattr(query_knowledge_base, "_pipe", None)
    if pipe is None:
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        query_knowledge_base._pipe = pipe
    output = pipe(final_prompt, **generation_args)
    return output[0]["generated_text"]
|
| 117 |
+
|
| 118 |
+
# Entry point: run a single demo question through the RAG pipeline.
if __name__ == "__main__":
    question = (
        "give the match summary of royal challengers bengaluru"
        " and mumbai indians in 2024"
    )
    print("YOUR QUESTION:\n", question, "\n")
    print("MICROSOFT 128K ANSWER: \n", query_knowledge_base(question))
|