Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
# HuggingFace Spaces application to analyze uploaded PDF files
|
| 5 |
# with open-source models ( hkunlp/instructor-xl )
|
| 6 |
#
|
| 7 |
-
# Mike Pastor February
|
| 8 |
|
| 9 |
|
| 10 |
import streamlit as st
|
|
@@ -25,16 +25,14 @@ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
|
| 25 |
|
| 26 |
# from langchain.vectorstores import FAISS
|
| 27 |
from langchain_community.vectorstores import FAISS
|
| 28 |
-
|
| 29 |
from langchain.text_splitter import CharacterTextSplitter
|
| 30 |
-
|
| 31 |
from langchain.memory import ConversationBufferMemory
|
| 32 |
from langchain.chains import ConversationalRetrievalChain
|
| 33 |
|
| 34 |
-
|
| 35 |
# from langchain.llms import HuggingFaceHub
|
| 36 |
from langchain_community.llms import HuggingFaceHub
|
| 37 |
|
|
|
|
| 38 |
def extract_pdf_text(pdf_docs):
|
| 39 |
text = ""
|
| 40 |
for pdf in pdf_docs:
|
|
@@ -43,6 +41,7 @@ def extract_pdf_text(pdf_docs):
|
|
| 43 |
text += page.extract_text()
|
| 44 |
return text
|
| 45 |
|
|
|
|
| 46 |
# Chunk size and overlap must not exceed the model's capacity!
|
| 47 |
#
|
| 48 |
def extract_bitesize_pieces(text):
|
|
@@ -55,7 +54,7 @@ def extract_bitesize_pieces(text):
|
|
| 55 |
chunks = text_splitter.split_text(text)
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
-
|
| 59 |
def prepare_embedding_vectors(text_chunks):
|
| 60 |
|
| 61 |
st.write('Here in vector store....', unsafe_allow_html=True)
|
|
@@ -82,7 +81,8 @@ def prepare_embedding_vectors(text_chunks):
|
|
| 82 |
st.write('FAISS succeeds: ')
|
| 83 |
|
| 84 |
return vectorstore
|
| 85 |
-
|
|
|
|
| 86 |
def prepare_conversation(vectorstore):
|
| 87 |
# llm = ChatOpenAI()
|
| 88 |
# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
|
|
@@ -98,6 +98,7 @@ def prepare_conversation(vectorstore):
|
|
| 98 |
)
|
| 99 |
return conversation_chain
|
| 100 |
|
|
|
|
| 101 |
def process_user_question(user_question):
|
| 102 |
|
| 103 |
print('process_user_question called: \n')
|
|
@@ -169,19 +170,22 @@ def main():
|
|
| 169 |
# st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=im )
|
| 170 |
# st.set_page_config(page_title="Pennwick PDF Analyzer")
|
| 171 |
|
| 172 |
-
import base64
|
| 173 |
-
from PIL import Image
|
| 174 |
|
| 175 |
-
# Open your image
|
| 176 |
-
image = Image.open("robot_icon.ico")
|
| 177 |
|
| 178 |
-
# Convert image to base64 string
|
| 179 |
-
with open("robot_icon.ico", "rb") as f:
|
| 180 |
-
|
| 181 |
|
| 182 |
-
# Set page config with base64 string
|
| 183 |
-
st.set_page_config(page_title="Pennwick File Analyzer 2", page_icon=f"data:image/ico;base64,{encoded_string}")
|
|
|
|
| 184 |
|
|
|
|
|
|
|
| 185 |
print( 'prepared page...\n')
|
| 186 |
|
| 187 |
|
|
@@ -194,8 +198,11 @@ def main():
|
|
| 194 |
if "chat_history" not in st.session_state:
|
| 195 |
st.session_state.chat_history = None
|
| 196 |
|
| 197 |
-
# st.header("Pennwick File Analyzer :shark:")
|
| 198 |
-
st.header("Pennwick File Analyzer 2")
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
user_question = None
|
| 201 |
user_question = st.text_input("Ask the Open Source - Flan-t5 Model a question about your uploaded documents:")
|
|
|
|
| 4 |
# HuggingFace Spaces application to analyze uploaded PDF files
|
| 5 |
# with open-source models ( hkunlp/instructor-xl )
|
| 6 |
#
|
| 7 |
+
# Mike Pastor February 17, 2024
|
| 8 |
|
| 9 |
|
| 10 |
import streamlit as st
|
|
|
|
| 25 |
|
| 26 |
# from langchain.vectorstores import FAISS
|
| 27 |
from langchain_community.vectorstores import FAISS
|
|
|
|
| 28 |
from langchain.text_splitter import CharacterTextSplitter
|
|
|
|
| 29 |
from langchain.memory import ConversationBufferMemory
|
| 30 |
from langchain.chains import ConversationalRetrievalChain
|
| 31 |
|
|
|
|
| 32 |
# from langchain.llms import HuggingFaceHub
|
| 33 |
from langchain_community.llms import HuggingFaceHub
|
| 34 |
|
| 35 |
+
##################################################################################
|
| 36 |
def extract_pdf_text(pdf_docs):
|
| 37 |
text = ""
|
| 38 |
for pdf in pdf_docs:
|
|
|
|
| 41 |
text += page.extract_text()
|
| 42 |
return text
|
| 43 |
|
| 44 |
+
##################################################################################
|
| 45 |
# Chunk size and overlap must not exceed the model's capacity!
|
| 46 |
#
|
| 47 |
def extract_bitesize_pieces(text):
|
|
|
|
| 54 |
chunks = text_splitter.split_text(text)
|
| 55 |
return chunks
|
| 56 |
|
| 57 |
+
##################################################################################
|
| 58 |
def prepare_embedding_vectors(text_chunks):
|
| 59 |
|
| 60 |
st.write('Here in vector store....', unsafe_allow_html=True)
|
|
|
|
| 81 |
st.write('FAISS succeeds: ')
|
| 82 |
|
| 83 |
return vectorstore
|
| 84 |
+
|
| 85 |
+
##################################################################################
|
| 86 |
def prepare_conversation(vectorstore):
|
| 87 |
# llm = ChatOpenAI()
|
| 88 |
# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
|
|
|
|
| 98 |
)
|
| 99 |
return conversation_chain
|
| 100 |
|
| 101 |
+
##################################################################################
|
| 102 |
def process_user_question(user_question):
|
| 103 |
|
| 104 |
print('process_user_question called: \n')
|
|
|
|
| 170 |
# st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=im )
|
| 171 |
# st.set_page_config(page_title="Pennwick PDF Analyzer")
|
| 172 |
|
| 173 |
+
# import base64
|
| 174 |
+
# from PIL import Image
|
| 175 |
|
| 176 |
+
# # Open your image
|
| 177 |
+
# image = Image.open("robot_icon.ico")
|
| 178 |
|
| 179 |
+
# # Convert image to base64 string
|
| 180 |
+
# with open("robot_icon.ico", "rb") as f:
|
| 181 |
+
# encoded_string = base64.b64encode(f.read()).decode()
|
| 182 |
|
| 183 |
+
# # Set page config with base64 string
|
| 184 |
+
# st.set_page_config(page_title="Pennwick File Analyzer 2", page_icon=f"data:image/ico;base64,{encoded_string}")
|
| 185 |
+
|
| 186 |
|
| 187 |
+
st.set_page_config(page_title="Pennwick File Analyzer", page_icon="./robot_icon.ico")
|
| 188 |
+
|
| 189 |
print( 'prepared page...\n')
|
| 190 |
|
| 191 |
|
|
|
|
| 198 |
if "chat_history" not in st.session_state:
|
| 199 |
st.session_state.chat_history = None
|
| 200 |
|
| 201 |
+
# st.header("Pennwick File Analyzer :shark:")
|
| 202 |
+
# st.header("Pennwick File Analyzer 2")
|
| 203 |
+
|
| 204 |
+
st.image("robot_icon.png", width=96 )
|
| 205 |
+
st.header(f"Pennwick File Analyzer")
|
| 206 |
|
| 207 |
user_question = None
|
| 208 |
user_question = st.text_input("Ask the Open Source - Flan-t5 Model a question about your uploaded documents:")
|