Spaces:

samiee2213
/

DataScribe

Build error

App Files Files Community

samiee2213 committed on Nov 13, 2024

Commit

14a0aaa

verified ·

1 Parent(s): 98915c7

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -109

app.py CHANGED Viewed

@@ -7,6 +7,9 @@ from googleapiclient.discovery import build
 from streamlit_chat import message as st_message
 import plotly.express as px
 import re
 import warnings
 import time
 from langchain.schema import HumanMessage, SystemMessage, AIMessage
@@ -18,10 +21,13 @@ from langchain.agents import initialize_agent, Tool
 from langchain.agents import AgentType
 from langchain_groq import ChatGroq
 import numpy as np
 from dotenv import load_dotenv
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 #environment
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
@@ -56,45 +62,70 @@ agent = initialize_agent(
 )
 # Function to perform the web search and get results
-def perform_web_search(query):
-    search_results = search.run(query)
-    return search_results
 # Function to get LLM response for dynamic queries
 def get_llm_response(entity, query, web_results):
     prompt = f"""
     Extract relevant {query} (e.g., email, phone number) from the following web results for the entity: {entity}.
     Web Results: {web_results}
     """
     human_message_content = f"""
     Entity: {entity}
     Query: {query}
     Web Results: {web_results}
     """
-    response = agent.invoke([system_message_content, human_message_content])
-    extracted_info = response.get("output", "Information not available").strip()
-    # Clean up irrelevant parts of the response
-    cleaned_info = re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
-    return cleaned_info
 # Retry logic for multiple web searches if necessary
 def refine_answer_with_searches(entity, query, max_retries=3):
     search_results = perform_web_search(query.format(entity=entity))
     extracted_answer = get_llm_response(entity, query, search_results)
-    retries = 0
-    while retries < max_retries:
-        if len(extracted_answer.split()) <= 2 or "not available" in extracted_answer.lower():
-            retries += 1
-            time.sleep(2)
-            search_results = perform_web_search(query.format(entity=entity))
-            extracted_answer = get_llm_response(entity, query, search_results)
-        else:
-            break
     return extracted_answer, search_results
 # Setup Google Sheets data fetch
@@ -122,24 +153,22 @@ with st.sidebar:
     )
 if selected == "Home":
     st.markdown("""
         <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
-        <p style="text-align:center; font-size: 18px;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
     """, unsafe_allow_html=True)
     st.markdown("""---""")
     def feature_card(title, description, icon, page):
         col1, col2 = st.columns([1, 4])
         with col1:
-            st.markdown(f"<div style='font-size: 40px;'>{icon}</div>", unsafe_allow_html=True)
         with col2:
-            if st.button(f"{title}", key=title):
                 st.session_state.selected_page = page
-            st.write(description)
     col1, col2 = st.columns([1, 1])
     with col1:
@@ -183,7 +212,7 @@ elif selected == "Upload Data":
     if data_source == "CSV Files":
         if "data" in st.session_state:
             st.success("Data uploaded successfully! Here is a preview:")
-            st.dataframe(st.session_state["data"])
         else:
             uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
@@ -200,45 +229,69 @@ elif selected == "Upload Data":
                     full_data = pd.concat(dfs, ignore_index=True)
                     st.session_state["data"] = full_data
                     st.success("Data uploaded successfully! Here is a preview:")
-                    st.dataframe(full_data)
                 else:
                     st.warning("No valid data found in the uploaded files.")
     elif data_source == "Google Sheets":
         sheet_id = st.text_input("Enter Google Sheet ID")
         range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
-        if st.button("Fetch Data"):
-            try:
-                data = get_google_sheet_data(sheet_id, range_name)
-                st.session_state["data"] = data
-                st.write("Data fetched successfully. Here is a preview:")
-                st.dataframe(data)
-            except Exception as e:
-                st.error(f"Error fetching data: {e}")
 elif selected == "Define Query":
     st.header("Define Your Custom Query")
     if "data" not in st.session_state or st.session_state["data"] is None:
-        st.warning("Please upload data first!")
     else:
-        column = st.selectbox("Select entity column", st.session_state["data"].columns)
-        st.markdown(f"""
         <style>
         div[data-baseweb="select"] div[data-id="select"] {{
             background-color: #f0f8ff;
         }}
         </style>
         """, unsafe_allow_html=True)
         st.subheader("Define Fields to Extract")
-        num_fields = st.number_input("Number of fields to extract", min_value=1, value=1, step=1)
         fields = []
         for i in range(num_fields):
-            field = st.text_input(f"Field {i+1} name", key=f"field_{i}")
             if field:
                 fields.append(field)
@@ -246,7 +299,8 @@ elif selected == "Define Query":
             st.subheader("Query Template")
             query_template = st.text_area(
                 "Enter query template (Use '{entity}' to represent each entity)",
-                value=f"Find the {', '.join(fields)} for {{entity}}"
             )
             if "{entity}" in query_template:
@@ -256,11 +310,15 @@ elif selected == "Define Query":
                 st.code(example_query)
             if st.button("Save Query Configuration"):
-                st.session_state["column_selection"] = column
-                st.session_state["query_template"] = query_template
-                st.session_state["extraction_fields"] = fields
-                st.success("Query configuration saved!")
 elif selected == "Extract Information":
     st.header("Extract Information")
@@ -274,51 +332,41 @@ elif selected == "Extract Information":
         st.write("### Selected Entity Column:")
         st.dataframe(entities_column)
-        st.write("Data extraction is in progress. This may take a few moments.")
-        # Custom styled progress bar
-        progress_bar = st.progress(0)
-        # Custom CSS for a cute progress bar style
-        st.markdown("""
-        <style>
-        .stProgress > div {
-            background-color: #FFB6C1;  /* Light pink */
-            border-radius: 20px;
-            height: 15px;
-        }
-        </style>
-        """, unsafe_allow_html=True)
-        try:
-            results = []
-            for i, selected_entity in enumerate(entities_column):
-                user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
-                final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
-                results.append({
-                    "Entity": selected_entity,
-                    "Extracted Information": final_answer,
-                    "Search Results": search_results
-                })
-                # Update progress bar with a smooth and cute animation
-                progress_bar.progress(int((i + 1) / len(entities_column) * 100))
-            st.session_state["results"] = results
-            st.write("### Extracted Information")
-            for result in results:
-                st.write(f"**Entity:** {result['Entity']}")
-                st.write(f"**Extracted Information:** {result['Extracted Information']}")
-            st.write("### Web Results:")
-            for result in results:
-                st.write(result["Search Results"])
-        except Exception as e:
-            st.error(f"An error occurred while extracting information: {e}")
     else:
         st.warning("Please upload your data and define the query template.")
 elif selected == "View & Download":
     st.header("View & Download Results")
@@ -326,27 +374,58 @@ elif selected == "View & Download":
         results_df = pd.DataFrame(st.session_state["results"])
         st.write("### Results Preview")
         st.dataframe(results_df.style.applymap(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
-        st.download_button(
-            label="Download all results as CSV",
-            data=results_df.to_csv(index=False),
-            file_name="extracted_results.csv",
-            mime="text/csv"
         )
-        st.download_button(
-            label="Download Extracted Information as CSV",
-            data=results_df[["Entity", "Extracted Information"]].to_csv(index=False),
-            file_name="extracted_information.csv",
-            mime="text/csv"
-        )
         st.download_button(
-            label="Download Web Results as CSV",
-            data=results_df[["Entity", "Search Results"]].to_csv(index=False),
-            file_name="web_results.csv",
             mime="text/csv"
         )
     else:
-        st.warning("No results available to view. Please run the extraction process.")

 from streamlit_chat import message as st_message
 import plotly.express as px
 import re
+import streamlit as st
+import gspread
+from google.oauth2.service_account import Credentials
 import warnings
 import time
 from langchain.schema import HumanMessage, SystemMessage, AIMessage
 from langchain.agents import AgentType
 from langchain_groq import ChatGroq
 import numpy as np
+import gspread
 from dotenv import load_dotenv
 warnings.filterwarnings("ignore", category=DeprecationWarning)
+scopes = ["https://www.googleapis.com/auth/spreadsheets"]
+creds = Credentials.from_service_account_file("credentials.json", scopes=scopes)
+client = gspread.authorize(creds)
 #environment
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 )
 # Function to perform the web search and get results
+def perform_web_search(query, max_retries=3, delay=2):
+    retries = 0
+    while retries < max_retries:
+        try:
+            search_results = search.run(query)
+            return search_results
+        except Exception as e:
+            retries += 1
+            st.warning(f"Web search failed for query '{query}'. Retrying ({retries}/{max_retries})...")
+            time.sleep(delay)
+    st.error(f"Failed to perform web search for query '{query}' after {max_retries} retries.")
+    return "NaN"
+def update_google_sheet(sheet_id, range_name, data):
+    try:
+        # Define the Google Sheets API scope
+        scopes = ["https://www.googleapis.com/auth/spreadsheets"]
+        creds = Credentials.from_service_account_file("credentials.json", scopes=scopes)
+        client = gspread.authorize(creds)
+        # Open the Google Sheet and specify the worksheet
+        sheet = client.open_by_key(sheet_id).worksheet(range_name.split("!")[0])
+        # Prepare data for update
+        data_to_update = [data.columns.tolist()] + data.values.tolist()
+        # Clear the existing content in the specified range and update it with new data
+        sheet.clear()
+        sheet.update(range_name, data_to_update)
+        st.success("Data successfully updated in the Google Sheet!")
+    except Exception as e:
+        st.error(f"Error updating Google Sheet: {e}")
 # Function to get LLM response for dynamic queries
 def get_llm_response(entity, query, web_results):
     prompt = f"""
     Extract relevant {query} (e.g., email, phone number) from the following web results for the entity: {entity}.
     Web Results: {web_results}
     """
     human_message_content = f"""
     Entity: {entity}
     Query: {query}
     Web Results: {web_results}
     """
+    try:
+        response = agent.invoke([system_message_content, human_message_content], handle_parsing_errors=True)
+        extracted_info = response.get("output", "Information not available").strip()
+        # Clean up irrelevant parts of the response
+        cleaned_info = re.sub(r"(Thought:|Action:)[^A-Za-z0-9]*", "", extracted_info).strip()
+        return cleaned_info
+    except Exception as e:
+        return "NaN"
 # Retry logic for multiple web searches if necessary
 def refine_answer_with_searches(entity, query, max_retries=3):
     search_results = perform_web_search(query.format(entity=entity))
     extracted_answer = get_llm_response(entity, query, search_results)
+    if len(extracted_answer.split()) <= 2 or "not available" in extracted_answer.lower():
+        search_results = perform_web_search(query.format(entity=entity))
+        extracted_answer = get_llm_response(entity, query, search_results)
     return extracted_answer, search_results
 # Setup Google Sheets data fetch
     )
 if selected == "Home":
     st.markdown("""
         <h1 style="text-align:center; color:#4CAF50; font-size: 40px;">🚀 Welcome to DataScribe</h1>
+        <p style="text-align:center; font-size: 18px; color:#333;">An AI-powered information extraction tool to streamline data retrieval and analysis.</p>
     """, unsafe_allow_html=True)
     st.markdown("""---""")
     def feature_card(title, description, icon, page):
         col1, col2 = st.columns([1, 4])
         with col1:
+            st.markdown(f"<div style='font-size: 40px; text-align:center;'>{icon}</div>", unsafe_allow_html=True)
         with col2:
+            if st.button(f"{title}", key=title, help=description):
                 st.session_state.selected_page = page
+            st.markdown(f"<p style='font-size: 14px; color:#555;'>{description}</p>", unsafe_allow_html=True)
     col1, col2 = st.columns([1, 1])
     with col1:
     if data_source == "CSV Files":
         if "data" in st.session_state:
             st.success("Data uploaded successfully! Here is a preview:")
+            st.dataframe(st.session_state["data"].head(10))  # Display only the first 10 rows for a cleaner view
         else:
             uploaded_files = st.file_uploader("Upload your CSV files", type=["csv"], accept_multiple_files=True)
                     full_data = pd.concat(dfs, ignore_index=True)
                     st.session_state["data"] = full_data
                     st.success("Data uploaded successfully! Here is a preview:")
+                    st.dataframe(full_data.head(10))  # Show preview of first 10 rows
                 else:
                     st.warning("No valid data found in the uploaded files.")
+            if st.button("Clear Data"):
+                del st.session_state["data"]
+                st.success("Data has been cleared!")
     elif data_source == "Google Sheets":
         sheet_id = st.text_input("Enter Google Sheet ID")
         range_name = st.text_input("Enter the data range (e.g., Sheet1!A1:C100)")
+        if sheet_id and range_name:
+            if st.button("Fetch Data"):
+                with st.spinner("Fetching data from Google Sheets..."):
+                    try:
+                        data = get_google_sheet_data(sheet_id, range_name)
+                        st.session_state["data"] = data
+                        st.success("Data fetched successfully! Here is a preview:")
+                        st.dataframe(data.head(10))  # Show preview of first 10 rows
+                    except Exception as e:
+                        st.error(f"Error fetching data: {e}")
+        else:
+            st.warning("Please enter both Sheet ID and Range name before fetching data.")
 elif selected == "Define Query":
     st.header("Define Your Custom Query")
     if "data" not in st.session_state or st.session_state["data"] is None:
+        st.warning("Please upload data first! Use the 'Upload Data' section to upload your data.")
     else:
+        column = st.selectbox(
+            "Select entity column",
+            st.session_state["data"].columns,
+            help="Select the column that contains the entities for which you want to define queries."
+        )
+        st.markdown("""
         <style>
         div[data-baseweb="select"] div[data-id="select"] {{
             background-color: #f0f8ff;
         }}
         </style>
         """, unsafe_allow_html=True)
         st.subheader("Define Fields to Extract")
+        num_fields = st.number_input(
+            "Number of fields to extract",
+            min_value=1,
+            value=1,
+            step=1,
+            help="Specify how many fields you want to extract from each entity."
+        )
         fields = []
         for i in range(num_fields):
+            field = st.text_input(
+                f"Field {i+1} name",
+                key=f"field_{i}",
+                placeholder=f"Enter field name for {i+1}",
+                help="Name the field you want to extract from the entity."
+            )
             if field:
                 fields.append(field)
             st.subheader("Query Template")
             query_template = st.text_area(
                 "Enter query template (Use '{entity}' to represent each entity)",
+                value=f"Find the {', '.join(fields)} for {{entity}}",
+                help="You can use {entity} as a placeholder to represent each entity in the query."
             )
             if "{entity}" in query_template:
                 st.code(example_query)
             if st.button("Save Query Configuration"):
+                if not fields:
+                    st.error("Please define at least one field to extract.")
+                elif not query_template:
+                    st.error("Please enter a query template.")
+                else:
+                    st.session_state["column_selection"] = column
+                    st.session_state["query_template"] = query_template
+                    st.session_state["extraction_fields"] = fields
+                    st.success("Query configuration saved successfully!")
 elif selected == "Extract Information":
     st.header("Extract Information")
         st.write("### Selected Entity Column:")
         st.dataframe(entities_column)
+        if st.button("Start Extraction"):
+            st.write("Data extraction is in progress. This may take a few moments.")
+            # Custom styled progress bar
+            progress_bar = st.progress(0)
+            try:
+                results = []
+                for i, selected_entity in enumerate(entities_column):
+                    user_query = st.session_state["query_template"].replace("{entity}", str(selected_entity))
+                    final_answer, search_results = refine_answer_with_searches(selected_entity, user_query)
+                    results.append({
+                        "Entity": selected_entity,
+                        "Extracted Information": final_answer,
+                        "Search Results": search_results
+                    })
+                    # Update progress bar with a smooth and cute animation
+                    progress_bar.progress(int((i + 1) / len(entities_column) * 100))
+                st.session_state["results"] = results
+                st.write("### Extracted Information")
+                for result in results:
+                    st.write(f"**Entity:** {result['Entity']}")
+                    st.write(f"**Extracted Information:** {result['Extracted Information']}")
+                st.write("### Web Results:")
+                for result in results:
+                    st.write(result["Search Results"])
+            except Exception as e:
+                st.error(f"An error occurred while extracting information: {e}")
     else:
         st.warning("Please upload your data and define the query template.")
 elif selected == "View & Download":
     st.header("View & Download Results")
         results_df = pd.DataFrame(st.session_state["results"])
         st.write("### Results Preview")
+        # Display results with some background color for the relevant columns
         st.dataframe(results_df.style.applymap(lambda val: 'background-color: #d3f4ff' if isinstance(val, str) else '', subset=["Extracted Information", "Search Results"]))
+        download_option = st.selectbox(
+            "Select data to download:",
+            ["All Results", "Extracted Information", "Web Results"]
         )
+        if download_option == "All Results":
+            data_to_download = results_df
+        elif download_option == "Extracted Information":
+            data_to_download = results_df[["Entity", "Extracted Information"]]
+        elif download_option == "Web Results":
+            data_to_download = results_df[["Entity", "Search Results"]]
         st.download_button(
+            label=f"Download {download_option} as CSV",
+            data=data_to_download.to_csv(index=False),
+            file_name=f"{download_option.lower().replace(' ', '_')}.csv",
             mime="text/csv"
         )
+        # To ensure the inputs and button are persistent, store their values in session_state
+        if 'sheet_id' not in st.session_state:
+            st.session_state.sheet_id = ''
+        if 'range_name' not in st.session_state:
+            st.session_state.range_name = ''
+        sheet_id = st.text_input("Enter Google Sheet ID", value=st.session_state.sheet_id)
+        range_name = st.text_input("Enter Range (e.g., 'Sheet1!A1')", value=st.session_state.range_name)
+        if sheet_id and range_name:
+            st.session_state.sheet_id = sheet_id
+            st.session_state.range_name = range_name
+            # Define data_to_update to update the Google Sheet
+            data_to_update = [results_df.columns.tolist()] + results_df.values.tolist()
+            # Update Google Sheets button
+            if st.button("Update Google Sheet"):
+                try:
+                    if '!' not in range_name:
+                        st.error("Invalid range format. Please use the format 'SheetName!Range'.")
+                    else:
+                        sheet_name, cell_range = range_name.split('!', 1)
+                        sheet = client.open_by_key(sheet_id).worksheet(sheet_name)
+                        sheet.clear()  # Clear the existing data before updating
+                        sheet.update(f"{cell_range}", data_to_update)  # Update the data to the specified range
+                        st.success("Data updated in the Google Sheet!")
+                except Exception as e:
+                    st.error(f"Error updating Google Sheet: {e}")
+        else:
+            st.warning("Please enter both the Sheet ID and Range name before updating.")
     else:
+        st.warning("No results available to view. Please run the extraction process.")