"""Streamlit app: query an uploaded PDF with an open-source LLM via LlamaIndex."""

import streamlit as st

from llama_index.core import Document, ServiceContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from PyPDF2 import PdfReader
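# Note: PyPDF2 is no longer maintained; its successor is the pypdf package,
# where the equivalent import would be `from pypdf import PdfReader`.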

class DocumentLoader:
    """Reads an uploaded PDF and wraps its text in a LlamaIndex Document."""

    @staticmethod
    def read_pdf(uploaded_file):
        # Concatenate the text of every page; `or ""` guards against pages
        # with no extractable text.
        pdf_reader = PdfReader(uploaded_file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
        return text

    @staticmethod
    def load_documents(uploaded_pdf):
        file_contents = DocumentLoader.read_pdf(uploaded_pdf)
        return [Document(text=file_contents)]

class IndexCreator:
    """Builds a vector index over the documents and returns a query engine."""

    @staticmethod
    def create_index(documents, hf_token):
        # Open-source LLM served through the Hugging Face Inference API.
        llm = HuggingFaceInferenceAPI(model_name="HuggingFaceH4/zephyr-7b-alpha", token=hf_token)
        # Embedding model used to vectorize the document chunks.
        embed_model_uae = HuggingFaceEmbedding(model_name="WhereIsAI/UAE-Large-V1")

        service_context = ServiceContext.from_defaults(
            llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embed_model_uae
        )
        index = VectorStoreIndex.from_documents(documents, service_context=service_context, show_progress=True)
        # Persist the index to disk (./storage by default) so it can be reloaded later.
        index.storage_context.persist()
        return index.as_query_engine()

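# Note: ServiceContext is deprecated in newer llama-index releases. A minimal
# sketch of the equivalent setup with the global Settings object (assuming
# llama-index >= 0.10):
#
#   from llama_index.core import Settings
#   Settings.llm = llm
#   Settings.embed_model = embed_model_uae
#   Settings.chunk_size = 800
#   Settings.chunk_overlap = 20
#   index = VectorStoreIndex.from_documents(documents, show_progress=True)
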
class PDFQueryApp:
    def __init__(self):
        st.title("Private LLM @Purbayan_Majumder")
        st.write("Base Model: **HuggingFaceH4/zephyr-7b-alpha** (open source, from Hugging Face)")
        st.write("Embedding Model: **WhereIsAI/UAE-Large-V1** (open source, from Hugging Face)")
        st.write("Ask anything about the data that you upload")
        st.write("Note: since this runs on a CPU, each response takes 5 to 8 minutes")

        self.hf_token = st.text_input("Enter your Hugging Face token [free]:")
        self.uploaded_pdf = st.file_uploader("Upload your data [PDF for now]", type=['pdf'])
        self.query_engine = None

    def load_and_create_index(self):
        # Both a PDF and a Hugging Face token are needed before indexing.
        if self.uploaded_pdf and self.hf_token:
            documents = DocumentLoader.load_documents(self.uploaded_pdf)
            st.success("Dataset has been loaded into the model successfully")
            self.query_engine = IndexCreator.create_index(documents, self.hf_token)
            st.success("Vector embeddings have been successfully created and initialized")
        elif not self.uploaded_pdf:
            st.warning("You have to upload a PDF file first.")
        else:
            st.warning("Please enter your Hugging Face token first.")

    def run_query(self, user_query):
        if self.query_engine and user_query:
            with st.spinner('Fetching the response from the model, please wait...'):
                response = self.query_engine.query(user_query)
            st.markdown(f"**Response:** {response}")
        else:
            st.warning("Please load documents and create vector embeddings before querying.")

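# Note: Streamlit reruns this script from top to bottom on every interaction,
# so the index is rebuilt each time. A minimal sketch of caching the query
# engine across reruns with st.session_state (hypothetical key name):
#
#   if "query_engine" not in st.session_state:
#       st.session_state["query_engine"] = IndexCreator.create_index(documents, hf_token)
#   app.query_engine = st.session_state["query_engine"]
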
if __name__ == "__main__":
    app = PDFQueryApp()
    app.load_and_create_index()

    user_query = st.text_input("Enter your query from the dataset:")
    app.run_query(user_query)
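# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py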