Update app/app.py
app/app.py  CHANGED  (+111 −101)

@@ -1,149 +1,159 @@
The previous version of the file (the 101 removed lines) ran the same pipeline of initializing ChatGroq, loading a news-article URL, splitting the text, and building a FAISS index, but it had no chat interface: each answer was shown once, followed by a "Sources" list built from `result.get("sources", "")`, and failures while retrieving an answer were logged and surfaced with `st.error`. The updated file replaces that one-shot flow with a persistent chat kept in `st.session_state`:
```python
# Import necessary libraries
import os  # This library helps us work with files and folders on the computer.
import streamlit as st  # Streamlit is a framework that helps us create interactive web apps.
from dotenv import load_dotenv  # Loads environment variables from a .env file, used for configuration.
from langchain.chains import RetrievalQAWithSourcesChain  # Answers questions based on retrieved documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Splits large texts into smaller, manageable parts.
from langchain.document_loaders import WebBaseLoader  # Fetches text from web pages.
from langchain_groq import ChatGroq  # The chat model used to generate responses.
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # Creates text embeddings for search and retrieval.
from langchain.vectorstores import FAISS  # Stores and searches vector representations of documents.
from langchain_core.messages import AIMessage, HumanMessage  # Message types exchanged in the chat interface.
import logging  # Tracks errors and important events in the application.

# Setup logging to capture important information and errors
logging.basicConfig(level=logging.INFO)  # INFO level records informative messages and above.

# Load environment variables from the .env file
load_dotenv()  # These may include sensitive data like API keys.
api_key = os.getenv("GROQ_API_KEY")  # API key for the ChatGroq model.
api_key_google = os.getenv("GOOGLE_API_KEY")  # Google API key, used by the embeddings model.
```
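Both keys are read from the environment. For a local run they would live in a `.env` file next to the app; the variable names come from the code above, while the values here are placeholders:

```
GROQ_API_KEY=...your-groq-key...
GOOGLE_API_KEY=...your-google-api-key...
```

On a hosted Space the same variables would be set as repository secrets rather than shipped in a `.env` file.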
The model setup:

```python
# Function to initialize the ChatGroq model
def initialize_model(api_key: str) -> ChatGroq:
    """Set up the ChatGroq model using the secret key."""
    return ChatGroq(
        model="mixtral-8x7b-32768",  # Model name for ChatGroq.
        temperature=0,  # 0 makes responses deterministic (less random).
        max_tokens=None,  # No limit on the number of tokens in the response.
        timeout=None,  # No timeout for responses.
        max_retries=5,  # Retry up to 5 times if a call fails.
        api_key=api_key,  # Use the provided API key for authentication.
    )
```
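As a quick smoke test, the returned model can be called directly. This is a sketch, not part of the commit; `invoke` is the standard langchain chat-model entry point:

```python
llm = initialize_model(api_key)
reply = llm.invoke("Say hello in one sentence.")  # Returns an AIMessage.
print(reply.content)
```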
Loading, splitting, and indexing:

```python
# Function to load data from a given URL
def load_url_data(url: str) -> list:
    """Get the text from the web page at the given URL."""
    loader = WebBaseLoader([url])  # Loader that fetches text from the provided URL.
    return loader.load()  # Load and return the page content.


# Function to split long documents into smaller chunks
def split_documents(data: list) -> list:
    """Break long pieces of text into smaller, manageable chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,  # Each chunk holds at most 1500 characters.
        separators=["\n", "\n\n", " ", ""],  # Split points, tried in order.
        chunk_overlap=20,  # 20 characters of overlap between chunks for context.
    )
    return text_splitter.split_documents(data)  # Split the text and return the parts.


# Function to create embeddings and save them in a FAISS index
def create_faiss_index(docs: list, index_path: str) -> FAISS:
    """Embed the documents and save them in a FAISS index."""
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # Embedding model.
    vectorstore = FAISS.from_documents(docs, embeddings)  # Build a searchable index.
    vectorstore.save_local(index_path)  # Save the index to a local folder.
    return vectorstore  # Return the vectorstore for immediate use.
```
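Because the index is persisted with `save_local`, a later session can reload it instead of re-embedding everything. A minimal sketch, assuming the same embedding model and a langchain version recent enough to require the `allow_dangerous_deserialization` opt-in:

```python
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # Must match build time.
vectorstore = FAISS.load_local(
    "faiss_index",  # The folder create_faiss_index() wrote to.
    embeddings,
    allow_dangerous_deserialization=True,  # Opt in to unpickling the stored docstore.
)
print(vectorstore.similarity_search("test query", k=5))  # Sanity check.
```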
The chat helper and the main Streamlit flow:

```python
# Function to generate a response from the AI based on user input
def get_response(user_query: str, vectorstore, chat_history, llm) -> str:
    """Generate a response from the AI model for the user's query."""
    # chat_history is accepted but not used yet; it is kept in the signature so
    # the prompt can later be made context-aware.
    chain = RetrievalQAWithSourcesChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),  # Retrieve the 5 most relevant chunks.
    )
    result = chain({"question": user_query}, return_only_outputs=True)  # Ask the chain.
    return result["answer"]  # The result also carries a "sources" key.


# Main application logic
def main() -> None:
    """This is where the main app runs and does all the work."""

    # Custom CSS for modifying the appearance of the app
    st.markdown(
        """
        <style>
        .stTitle { font-size: 40px; }  /* Title font size. */
        .stSidebar { font-size: 20px; }  /* Sidebar font size. */
        .stTextInput, .stButton { font-size: 18px; }  /* Inputs and buttons. */
        .stTextArea { font-size: 16px; }  /* Text areas. */
        </style>
        """,
        unsafe_allow_html=True,  # Allow HTML for custom styling.
    )

    st.title("News Research Tool")  # Main title of the application.
    st.sidebar.title("News Article URL")  # Title for the sidebar input.

    url = st.sidebar.text_input("Enter the news article URL 📰")  # Prompt for a news article URL.
    process_url_clicked = st.sidebar.button("Process URL")  # Button that triggers URL processing.
    index_path = "faiss_index"  # Folder name for saving the FAISS index.
    main_placeholder = st.empty()  # Placeholder for dynamic content in the main area.

    # Initialize chat history and vectorstore in session state
    if "chat_history" not in st.session_state:  # Does chat history already exist?
        st.session_state.chat_history = []  # If not, create a new list for it.
    if "vectorstore" not in st.session_state:  # Is the vectorstore initialized?
        st.session_state.vectorstore = None

    llm = initialize_model(api_key)  # Initialize the chat model with the API key.

    if process_url_clicked:  # The user clicked the button to process the URL.
        if not url:  # No URL entered.
            st.sidebar.error("Please provide a URL")  # Show an error message.
        elif not (url.startswith("http://") or url.startswith("https://")):  # Malformed URL.
            st.sidebar.error("Invalid URL format 🚫. Please include 'http://' or 'https://'.")
        else:  # The URL looks valid.
            try:
                with st.spinner("Loading data from URL..."):
                    data = load_url_data(url)
                with st.spinner("Splitting text into chunks... ✂️"):
                    docs = split_documents(data)
                with st.spinner("Creating embeddings..."):
                    vectorstore = create_faiss_index(docs, index_path)  # Build and save the index.
                    st.session_state.vectorstore = vectorstore  # Keep it for the chat below.
                st.success("Data processed and index saved! You can now ask questions.")
            except Exception as e:  # Something failed along the way.
                logging.error(f"Error processing URL: {e}")  # Log the error for debugging.
                st.sidebar.error(f"Error processing URL: {e}")  # Show the error in the sidebar.
                st.error("Error occurred while processing the URL. Please try again.")

    # Display the chat history (both AI and user messages)
    for message in st.session_state.chat_history:
        if isinstance(message, AIMessage):  # Message from the AI.
            with st.chat_message("AI"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):  # Message from the user.
            with st.chat_message("Human"):
                st.markdown(message.content)

    # Input field for the user to type their message
    user_query = st.chat_input("Type a message...")
    if user_query and user_query.strip():  # Non-empty query.
        st.session_state.chat_history.append(HumanMessage(content=user_query))

        with st.chat_message("Human"):
            st.markdown(user_query)

        with st.chat_message("AI"):
            if st.session_state.vectorstore:  # Only answer once an index exists.
                response = get_response(
                    user_query, st.session_state.vectorstore,
                    st.session_state.chat_history, llm,
                )
            else:
                response = "Please process a URL first before asking questions."
            st.markdown(response)  # Show the AI's response.

        st.session_state.chat_history.append(AIMessage(content=response))


# Run the main application function
if __name__ == "__main__":  # Only when this script is run directly.
    main()  # Start the application.
```
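To try the app locally, something along these lines should work; the package list is an assumption, since the commit does not pin dependencies:

```
pip install streamlit python-dotenv langchain langchain-groq langchain-google-genai faiss-cpu
streamlit run app/app.py
```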