Documentation
Browse files
- app.py +26 -2
- utils/document_parsing.py +83 -2
- utils/llm_generation.py +46 -3
- utils/retrieval.py +12 -1
app.py
CHANGED
|
@@ -16,15 +16,39 @@ llm_model_name = "gpt-4o-mini"
|
|
| 16 |
llm_generator = None
|
| 17 |
|
| 18 |
|
| 19 |
-
def set_api_key(api_key: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
if api_key.strip():
|
| 21 |
os.environ["OPENAI_API_KEY"] = api_key
|
| 22 |
else:
|
| 23 |
raise gr.Error("Please provide a valid API key")
|
| 24 |
|
| 25 |
|
| 26 |
-
def process_inputs(api_key: str, pdf_file, questions: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Setup Api KEY
|
| 29 |
set_api_key(api_key)
|
| 30 |
|
|
|
|
| 16 |
llm_generator = None
|
| 17 |
|
| 18 |
|
| 19 |
+
def set_api_key(api_key: str) -> None:
|
| 20 |
+
"""
|
| 21 |
+
Sets the OpenAI API key as an environment variable.
|
| 22 |
+
|
| 23 |
+
Parameters:
|
| 24 |
+
api_key (str): The OpenAI API key to be set.
|
| 25 |
+
|
| 26 |
+
Returns:
|
| 27 |
+
None: This function does not return any value.
|
| 28 |
+
|
| 29 |
+
Raises:
|
| 30 |
+
gr.Error: If the provided API key is empty or consists only of whitespace characters.
|
| 31 |
+
"""
|
| 32 |
if api_key.strip():
|
| 33 |
os.environ["OPENAI_API_KEY"] = api_key
|
| 34 |
else:
|
| 35 |
raise gr.Error("Please provide a valid API key")
|
| 36 |
|
| 37 |
|
| 38 |
+
def process_inputs(api_key: str, pdf_file, questions: str) -> str:
|
| 39 |
+
"""
|
| 40 |
+
This function processes the inputs, sets up the API key, validates the PDF file, parses the PDF,
|
| 41 |
+
creates a vector store, initializes the LLM generator, validates the questions, retrieves the top similar chunks,
|
| 42 |
+
generates answers, and returns the output in JSON format.
|
| 43 |
+
|
| 44 |
+
Parameters:
|
| 45 |
+
api_key (str): The OpenAI API key for accessing the LLM model.
|
| 46 |
+
pdf_file (File): The uploaded PDF file.
|
| 47 |
+
questions (str): The list of questions, one per line.
|
| 48 |
|
| 49 |
+
Returns:
|
| 50 |
+
str: The output in JSON format containing the answers to the questions.
|
| 51 |
+
"""
|
| 52 |
# Setup Api KEY
|
| 53 |
set_api_key(api_key)
|
| 54 |
|
utils/document_parsing.py
CHANGED
|
@@ -10,6 +10,17 @@ class DocParsing:
|
|
| 10 |
chunk_overlap = 50
|
| 11 |
|
| 12 |
def __init__(self, file_path, model_name, max_model_tokens=384):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
self.file_path = file_path
|
| 14 |
|
| 15 |
# Initialize the tokenizer for all-MiniLM
|
|
@@ -18,16 +29,59 @@ class DocParsing:
|
|
| 18 |
self.max_model_tokens = max_model_tokens
|
| 19 |
|
| 20 |
def process_pdf(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
self.load_pdf()
|
| 22 |
self.create_chunks()
|
| 23 |
return self.chunks
|
| 24 |
|
| 25 |
def load_pdf(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
loader = PyPDFLoader(self.file_path)
|
| 27 |
self.documents = loader.load()
|
| 28 |
|
| 29 |
def create_chunks(self):
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
self.chunks = []
|
| 32 |
for doc in self.documents:
|
| 33 |
self.chunks.extend(
|
|
@@ -37,10 +91,37 @@ class DocParsing:
|
|
| 37 |
)
|
| 38 |
|
| 39 |
def tokenize(self, text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
return self.tokenizer.encode(text, add_special_tokens=False)
|
| 41 |
|
| 42 |
def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
|
| 43 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
tokens = self.tokenize(doc.page_content)
|
| 45 |
chunks = []
|
| 46 |
start = 0
|
|
|
|
| 10 |
chunk_overlap = 50
|
| 11 |
|
| 12 |
def __init__(self, file_path, model_name, max_model_tokens=384):
|
| 13 |
+
"""
|
| 14 |
+
Initialize the DocParsing class with the provided file path, model name, and maximum model tokens.
|
| 15 |
+
|
| 16 |
+
Parameters:
|
| 17 |
+
file_path (str): The path to the PDF file to be processed.
|
| 18 |
+
model_name (str): The name of the transformer model to be used for tokenization.
|
| 19 |
+
max_model_tokens (int, optional): The maximum number of tokens allowed for each chunk. Defaults to 384.
|
| 20 |
+
|
| 21 |
+
Returns:
|
| 22 |
+
None
|
| 23 |
+
"""
|
| 24 |
self.file_path = file_path
|
| 25 |
|
| 26 |
# Initialize the tokenizer for all-MiniLM
|
|
|
|
| 29 |
self.max_model_tokens = max_model_tokens
|
| 30 |
|
| 31 |
def process_pdf(self):
|
| 32 |
+
"""
|
| 33 |
+
Process the PDF file by loading it, splitting it into chunks, and returning the chunks.
|
| 34 |
+
|
| 35 |
+
This function first calls the `load_pdf` method to load the PDF file into a list of Document objects.
|
| 36 |
+
Then, it calls the `create_chunks` method to split each Document into smaller chunks based on the specified
|
| 37 |
+
chunk size and overlap. Finally, it returns the list of chunks.
|
| 38 |
+
|
| 39 |
+
Parameters:
|
| 40 |
+
None
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
list: A list of Document objects, where each Document represents a chunk of the PDF file.
|
| 44 |
+
"""
|
| 45 |
self.load_pdf()
|
| 46 |
self.create_chunks()
|
| 47 |
return self.chunks
|
| 48 |
|
| 49 |
def load_pdf(self):
|
| 50 |
+
"""
|
| 51 |
+
Load the PDF file specified by the file_path attribute into a list of Document objects.
|
| 52 |
+
|
| 53 |
+
This function uses the PyPDFLoader class from the langchain library to load the PDF file.
|
| 54 |
+
The loaded Document objects are stored in the self.documents attribute.
|
| 55 |
+
|
| 56 |
+
Parameters:
|
| 57 |
+
None
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
None
|
| 61 |
+
|
| 62 |
+
Raises:
|
| 63 |
+
FileNotFoundError: If the specified file_path does not exist or cannot be accessed.
|
| 64 |
+
"""
|
| 65 |
loader = PyPDFLoader(self.file_path)
|
| 66 |
self.documents = loader.load()
|
| 67 |
|
| 68 |
def create_chunks(self):
|
| 69 |
+
"""
|
| 70 |
+
Split the loaded PDF documents into smaller chunks based on the specified chunk size and overlap.
|
| 71 |
+
|
| 72 |
+
This function iterates through each Document object in the self.documents list and calls the
|
| 73 |
+
token_split_document method to split the Document into smaller chunks. The resulting chunks are
|
| 74 |
+
then appended to the self.chunks list.
|
| 75 |
+
|
| 76 |
+
Parameters:
|
| 77 |
+
None
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
None
|
| 81 |
+
|
| 82 |
+
Attributes:
|
| 83 |
+
self.chunks (list): A list of Document objects, where each Document represents a chunk of the PDF file.
|
| 84 |
+
"""
|
| 85 |
self.chunks = []
|
| 86 |
for doc in self.documents:
|
| 87 |
self.chunks.extend(
|
|
|
|
| 91 |
)
|
| 92 |
|
| 93 |
def tokenize(self, text):
|
| 94 |
+
"""
|
| 95 |
+
Tokenize the input text using the transformer model's tokenizer.
|
| 96 |
+
|
| 97 |
+
This method uses the tokenizer provided by the transformer model to encode the input text.
|
| 98 |
+
The special tokens are not added to the encoded tokens.
|
| 99 |
+
|
| 100 |
+
Parameters:
|
| 101 |
+
text (str): The input text to be tokenized.
|
| 102 |
+
|
| 103 |
+
Returns:
|
| 104 |
+
list: A list of integers representing the tokenized input text.
|
| 105 |
+
"""
|
| 106 |
return self.tokenizer.encode(text, add_special_tokens=False)
|
| 107 |
|
| 108 |
def token_split_document(self, doc: Document, chunk_size=350, chunk_overlap=50):
|
| 109 |
+
"""
|
| 110 |
+
Split a single Document into multiple chunks based on token length.
|
| 111 |
+
|
| 112 |
+
This function tokenizes the input Document's page content, then splits the tokens into smaller chunks
|
| 113 |
+
of specified size. Overlapping chunks are created by moving the start index forward by the difference
|
| 114 |
+
between chunk size and overlap. Each chunk is then decoded back into text and a new Document is created
|
| 115 |
+
with the same metadata but truncated text.
|
| 116 |
+
|
| 117 |
+
Parameters:
|
| 118 |
+
doc (Document): The input Document to be split into chunks.
|
| 119 |
+
chunk_size (int, optional): The size of each chunk in tokens. Defaults to 350.
|
| 120 |
+
chunk_overlap (int, optional): The overlap between chunks in tokens. Defaults to 50.
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
list: A list of Document objects, where each Document represents a chunk of the input Document.
|
| 124 |
+
"""
|
| 125 |
tokens = self.tokenize(doc.page_content)
|
| 126 |
chunks = []
|
| 127 |
start = 0
|
utils/llm_generation.py
CHANGED
|
@@ -26,6 +26,15 @@ json_schema = {
|
|
| 26 |
|
| 27 |
class LLMGeneration:
|
| 28 |
def __init__(self, llm_model_name="gpt-4o-mini"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
self.llm_model_name = llm_model_name
|
| 30 |
self.llm = ChatOpenAI(
|
| 31 |
model_name=self.llm_model_name,
|
|
@@ -41,6 +50,18 @@ class LLMGeneration:
|
|
| 41 |
self.create_initial_prompt()
|
| 42 |
|
| 43 |
def create_initial_prompt(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# System message for the chain
|
| 45 |
system_message = SystemMessage(
|
| 46 |
content=(
|
|
@@ -61,8 +82,21 @@ class LLMGeneration:
|
|
| 61 |
|
| 62 |
self.initial_prompt_messages = [system_message] + few_shots
|
| 63 |
|
| 64 |
-
def create_human_message_prompt(self, query: str, docs: List[Document]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# Prepare the context from the retrieved chunks
|
| 67 |
context = "\n\n".join(
|
| 68 |
[f"<context>{doc.page_content}</context>" for doc in docs]
|
|
@@ -76,15 +110,24 @@ class LLMGeneration:
|
|
| 76 |
|
| 77 |
return HumanMessagePromptTemplate.from_template(human_message)
|
| 78 |
|
| 79 |
-
def generate_answer(self, query: str, docs: List[Document]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
|
|
|
|
|
|
|
|
|
| 81 |
# Create the prompt template
|
| 82 |
prompt = ChatPromptTemplate.from_messages(
|
| 83 |
self.initial_prompt_messages
|
| 84 |
+ [self.create_human_message_prompt(query, docs)]
|
| 85 |
)
|
| 86 |
|
| 87 |
-
# Create and run the chain with the
|
| 88 |
chain = LLMChain(
|
| 89 |
llm=self.llm,
|
| 90 |
prompt=prompt,
|
|
|
|
| 26 |
|
| 27 |
class LLMGeneration:
|
| 28 |
def __init__(self, llm_model_name="gpt-4o-mini"):
|
| 29 |
+
"""
|
| 30 |
+
Initialize the LLMGeneration class with a specified LLM model.
|
| 31 |
+
|
| 32 |
+
Parameters:
|
| 33 |
+
llm_model_name (str): The name of the LLM model to be used. Default is "gpt-4o-mini".
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
None
|
| 37 |
+
"""
|
| 38 |
self.llm_model_name = llm_model_name
|
| 39 |
self.llm = ChatOpenAI(
|
| 40 |
model_name=self.llm_model_name,
|
|
|
|
| 50 |
self.create_initial_prompt()
|
| 51 |
|
| 52 |
def create_initial_prompt(self):
|
| 53 |
+
"""
|
| 54 |
+
Prepares the initial prompt for the LLMChain.
|
| 55 |
+
|
| 56 |
+
This function creates a system message and few-shot examples for the LLMChain.
|
| 57 |
+
The system message instructs the assistant to use the provided context to answer the user's question,
|
| 58 |
+
and to follow a structured JSON format for the answer. It also specifies the conditions for providing an answer.
|
| 59 |
+
|
| 60 |
+
The few-shot examples include a context and a question, along with the expected answer in JSON format.
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
None. The initial prompt messages are stored in the `initial_prompt_messages` attribute of the class instance.
|
| 64 |
+
"""
|
| 65 |
# System message for the chain
|
| 66 |
system_message = SystemMessage(
|
| 67 |
content=(
|
|
|
|
| 82 |
|
| 83 |
self.initial_prompt_messages = [system_message] + few_shots
|
| 84 |
|
| 85 |
+
def create_human_message_prompt(self, query: str, docs: List[Document]) -> HumanMessagePromptTemplate:
|
| 86 |
+
"""
|
| 87 |
+
Prepares a human message prompt for the LLMChain.
|
| 88 |
+
|
| 89 |
+
This function constructs a human message that includes the provided context and a question.
|
| 90 |
+
The context is extracted from the list of documents and formatted as per the required structure.
|
| 91 |
+
The question is included in the human message.
|
| 92 |
|
| 93 |
+
Parameters:
|
| 94 |
+
query (str): The user's question for which an answer needs to be generated.
|
| 95 |
+
docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
HumanMessagePromptTemplate: A human message prompt template that can be used with the LLMChain.
|
| 99 |
+
"""
|
| 100 |
# Prepare the context from the retrieved chunks
|
| 101 |
context = "\n\n".join(
|
| 102 |
[f"<context>{doc.page_content}</context>" for doc in docs]
|
|
|
|
| 110 |
|
| 111 |
return HumanMessagePromptTemplate.from_template(human_message)
|
| 112 |
|
| 113 |
+
def generate_answer(self, query: str, docs: List[Document]) -> str:
|
| 114 |
+
"""
|
| 115 |
+
Generate an answer to the user's query using the provided documents and the LLM model.
|
| 116 |
+
|
| 117 |
+
Parameters:
|
| 118 |
+
query (str): The user's question for which an answer needs to be generated.
|
| 119 |
+
docs (List[Document]): A list of documents retrieved from the search engine. Each document contains a 'page_content' attribute.
|
| 120 |
|
| 121 |
+
Returns:
|
| 122 |
+
str: The answer to the user's query. If no answer is found, returns an empty string.
|
| 123 |
+
"""
|
| 124 |
# Create the prompt template
|
| 125 |
prompt = ChatPromptTemplate.from_messages(
|
| 126 |
self.initial_prompt_messages
|
| 127 |
+ [self.create_human_message_prompt(query, docs)]
|
| 128 |
)
|
| 129 |
|
| 130 |
+
# Create and run the chain with the configured LLM (gpt-4o-mini by default)
|
| 131 |
chain = LLMChain(
|
| 132 |
llm=self.llm,
|
| 133 |
prompt=prompt,
|
utils/retrieval.py
CHANGED
|
@@ -6,6 +6,16 @@ from typing import List
|
|
| 6 |
|
| 7 |
class Retrieval:
|
| 8 |
def __init__(self, model_name, max_model_tokens=384):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
self.model_name = model_name
|
| 10 |
self.embeddings = HuggingFaceEmbeddings(
|
| 11 |
model_name=model_name,
|
|
@@ -13,12 +23,13 @@ class Retrieval:
|
|
| 13 |
)
|
| 14 |
|
| 15 |
def create_vector_store(self, chunks: List[Document]):
|
| 16 |
-
|
| 17 |
self.chunks = chunks
|
| 18 |
# Create FAISS vector store
|
| 19 |
self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)
|
| 20 |
|
| 21 |
def search(self, query, k=10) -> List[Document]:
|
|
|
|
| 22 |
# Retrieve top 10 similar chunks
|
| 23 |
similar_docs = self.vectorstore.similarity_search(query, k)
|
| 24 |
|
|
|
|
| 6 |
|
| 7 |
class Retrieval:
|
| 8 |
def __init__(self, model_name, max_model_tokens=384):
|
| 9 |
+
"""
|
| 10 |
+
Initialize the Retrieval class with HuggingFace embeddings. The FAISS vector store is not built here; it is created later by `create_vector_store`.
|
| 11 |
+
|
| 12 |
+
Parameters:
|
| 13 |
+
model_name (str): The name of the HuggingFace model to use for embeddings.
|
| 14 |
+
max_model_tokens (int, optional): The maximum number of tokens to use for encoding. Defaults to 384.
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
None
|
| 18 |
+
"""
|
| 19 |
self.model_name = model_name
|
| 20 |
self.embeddings = HuggingFaceEmbeddings(
|
| 21 |
model_name=model_name,
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
def create_vector_store(self, chunks: List[Document]):
|
| 26 |
+
"""Creates a new vector store for similarity search"""
|
| 27 |
self.chunks = chunks
|
| 28 |
# Create FAISS vector store
|
| 29 |
self.vectorstore = FAISS.from_documents(self.chunks, self.embeddings)
|
| 30 |
|
| 31 |
def search(self, query, k=10) -> List[Document]:
|
| 32 |
+
"""Search top matching documents"""
|
| 33 |
# Retrieve the top k similar chunks (k defaults to 10)
|
| 34 |
similar_docs = self.vectorstore.similarity_search(query, k)
|
| 35 |
|