Update app/app.py
app/app.py  CHANGED  (+111 −101)

@@ -1,149 +1,159 @@
The previous version of the file (the 101 removed lines) ran the same pipeline of initializing ChatGroq, loading a news-article URL, splitting the text, and building a FAISS index, but it had no chat interface: each answer was shown once, followed by a "Sources" list built from `result.get("sources", "")`, and failures while retrieving an answer were logged and surfaced with `st.error`. The updated file replaces that one-shot flow with a persistent chat kept in `st.session_state`:
```python
# Import necessary libraries
import os  # This library helps us work with files and folders on the computer.
import streamlit as st  # Streamlit is a framework that helps us create interactive web apps.
from dotenv import load_dotenv  # Loads environment variables from a .env file, used for configuration.
from langchain.chains import RetrievalQAWithSourcesChain  # Answers questions based on retrieved documents.
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Splits large texts into smaller, manageable parts.
from langchain.document_loaders import WebBaseLoader  # Fetches text from web pages.
from langchain_groq import ChatGroq  # The chat model used to generate responses.
from langchain_google_genai import GoogleGenerativeAIEmbeddings  # Creates text embeddings for search and retrieval.
from langchain.vectorstores import FAISS  # Stores and searches vector representations of documents.
from langchain_core.messages import AIMessage, HumanMessage  # Message types exchanged in the chat interface.
import logging  # Tracks errors and important events in the application.

# Setup logging to capture important information and errors
logging.basicConfig(level=logging.INFO)  # INFO level records informative messages and above.

# Load environment variables from the .env file
load_dotenv()  # These may include sensitive data like API keys.
api_key = os.getenv("GROQ_API_KEY")  # API key for the ChatGroq model.
api_key_google = os.getenv("GOOGLE_API_KEY")  # Google API key, used by the embeddings model.
```
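Both keys are read from the environment. For a local run they would live in a `.env` file next to the app; the variable names come from the code above, while the values here are placeholders:

```
GROQ_API_KEY=...your-groq-key...
GOOGLE_API_KEY=...your-google-api-key...
```

On a hosted Space the same variables would be set as repository secrets rather than shipped in a `.env` file.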
The model setup:

```python
# Function to initialize the ChatGroq model
def initialize_model(api_key: str) -> ChatGroq:
    """Set up the ChatGroq model using the secret key."""
    return ChatGroq(
        model="mixtral-8x7b-32768",  # Model name for ChatGroq.
        temperature=0,  # 0 makes responses deterministic (less random).
        max_tokens=None,  # No limit on the number of tokens in the response.
        timeout=None,  # No timeout for responses.
        max_retries=5,  # Retry up to 5 times if a call fails.
        api_key=api_key,  # Use the provided API key for authentication.
    )
```
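As a quick smoke test, the returned model can be called directly. This is a sketch, not part of the commit; `invoke` is the standard langchain chat-model entry point:

```python
llm = initialize_model(api_key)
reply = llm.invoke("Say hello in one sentence.")  # Returns an AIMessage.
print(reply.content)
```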
Loading, splitting, and indexing:

```python
# Function to load data from a given URL
def load_url_data(url: str) -> list:
    """Get the text from the web page at the given URL."""
    loader = WebBaseLoader([url])  # Loader that fetches text from the provided URL.
    return loader.load()  # Load and return the page content.


# Function to split long documents into smaller chunks
def split_documents(data: list) -> list:
    """Break long pieces of text into smaller, manageable chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,  # Each chunk holds at most 1500 characters.
        separators=["\n", "\n\n", " ", ""],  # Split points, tried in order.
        chunk_overlap=20,  # 20 characters of overlap between chunks for context.
    )
    return text_splitter.split_documents(data)  # Split the text and return the parts.


# Function to create embeddings and save them in a FAISS index
def create_faiss_index(docs: list, index_path: str) -> FAISS:
    """Embed the documents and save them in a FAISS index."""
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # Embedding model.
    vectorstore = FAISS.from_documents(docs, embeddings)  # Build a searchable index.
    vectorstore.save_local(index_path)  # Save the index to a local folder.
    return vectorstore  # Return the vectorstore for immediate use.
```
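Because the index is persisted with `save_local`, a later session can reload it instead of re-embedding everything. A minimal sketch, assuming the same embedding model and a langchain version recent enough to require the `allow_dangerous_deserialization` opt-in:

```python
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # Must match build time.
vectorstore = FAISS.load_local(
    "faiss_index",  # The folder create_faiss_index() wrote to.
    embeddings,
    allow_dangerous_deserialization=True,  # Opt in to unpickling the stored docstore.
)
print(vectorstore.similarity_search("test query", k=5))  # Sanity check.
```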
The chat helper and the main Streamlit flow:

```python
# Function to generate a response from the AI based on user input
def get_response(user_query: str, vectorstore, chat_history, llm) -> str:
    """Generate a response from the AI model for the user's query."""
    # chat_history is accepted but not used yet; it is kept in the signature so
    # the prompt can later be made context-aware.
    chain = RetrievalQAWithSourcesChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),  # Retrieve the 5 most relevant chunks.
    )
    result = chain({"question": user_query}, return_only_outputs=True)  # Ask the chain.
    return result["answer"]  # The result also carries a "sources" key.


# Main application logic
def main() -> None:
    """This is where the main app runs and does all the work."""

    # Custom CSS for modifying the appearance of the app
    st.markdown(
        """
        <style>
        .stTitle { font-size: 40px; }  /* Title font size. */
        .stSidebar { font-size: 20px; }  /* Sidebar font size. */
        .stTextInput, .stButton { font-size: 18px; }  /* Inputs and buttons. */
        .stTextArea { font-size: 16px; }  /* Text areas. */
        </style>
        """,
        unsafe_allow_html=True,  # Allow HTML for custom styling.
    )

    st.title("News Research Tool")  # Main title of the application.
    st.sidebar.title("News Article URL")  # Title for the sidebar input.

    url = st.sidebar.text_input("Enter the news article URL 📰")  # Prompt for a news article URL.
    process_url_clicked = st.sidebar.button("Process URL")  # Button that triggers URL processing.
    index_path = "faiss_index"  # Folder name for saving the FAISS index.
    main_placeholder = st.empty()  # Placeholder for dynamic content in the main area.

    # Initialize chat history and vectorstore in session state
    if "chat_history" not in st.session_state:  # Does chat history already exist?
        st.session_state.chat_history = []  # If not, create a new list for it.
    if "vectorstore" not in st.session_state:  # Is the vectorstore initialized?
        st.session_state.vectorstore = None

    llm = initialize_model(api_key)  # Initialize the chat model with the API key.

    if process_url_clicked:  # The user clicked the button to process the URL.
        if not url:  # No URL entered.
            st.sidebar.error("Please provide a URL")  # Show an error message.
        elif not (url.startswith("http://") or url.startswith("https://")):  # Malformed URL.
            st.sidebar.error("Invalid URL format 🚫. Please include 'http://' or 'https://'.")
        else:  # The URL looks valid.
            try:
                with st.spinner("Loading data from URL..."):
                    data = load_url_data(url)
                with st.spinner("Splitting text into chunks... ✂️"):
                    docs = split_documents(data)
                with st.spinner("Creating embeddings..."):
                    vectorstore = create_faiss_index(docs, index_path)  # Build and save the index.
                    st.session_state.vectorstore = vectorstore  # Keep it for the chat below.
                st.success("Data processed and index saved! You can now ask questions.")
            except Exception as e:  # Something failed along the way.
                logging.error(f"Error processing URL: {e}")  # Log the error for debugging.
                st.sidebar.error(f"Error processing URL: {e}")  # Show the error in the sidebar.
                st.error("Error occurred while processing the URL. Please try again.")

    # Display the chat history (both AI and user messages)
    for message in st.session_state.chat_history:
        if isinstance(message, AIMessage):  # Message from the AI.
            with st.chat_message("AI"):
                st.markdown(message.content)
        elif isinstance(message, HumanMessage):  # Message from the user.
            with st.chat_message("Human"):
                st.markdown(message.content)

    # Input field for the user to type their message
    user_query = st.chat_input("Type a message...")
    if user_query and user_query.strip():  # Non-empty query.
        st.session_state.chat_history.append(HumanMessage(content=user_query))

        with st.chat_message("Human"):
            st.markdown(user_query)

        with st.chat_message("AI"):
            if st.session_state.vectorstore:  # Only answer once an index exists.
                response = get_response(
                    user_query, st.session_state.vectorstore,
                    st.session_state.chat_history, llm,
                )
            else:
                response = "Please process a URL first before asking questions."
            st.markdown(response)  # Show the AI's response.

        st.session_state.chat_history.append(AIMessage(content=response))


# Run the main application function
if __name__ == "__main__":  # Only when this script is run directly.
    main()  # Start the application.
```
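To try the app locally, something along these lines should work; the package list is an assumption, since the commit does not pin dependencies:

```
pip install streamlit python-dotenv langchain langchain-groq langchain-google-genai faiss-cpu
streamlit run app/app.py
```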