Spaces:

poemsforaphrodite
/

finance_rag

Sleeping

App Files Files Community

poemsforaphrodite committed on Sep 11, 2024

Commit

63e34b3

verified ·

1 Parent(s): 212732e

Update app.py

Browse files

Files changed (1) hide show

app.py +177 -97

app.py CHANGED Viewed

@@ -28,6 +28,10 @@ import numpy as np
 from pymongo import MongoClient
 import traceback
 from docx import Document
 load_dotenv()
@@ -361,97 +365,124 @@ def get_category_reports():
         ]
     }
-def analyze_excel_with_gpt(df, sheet_name, user_feedback, category, reports_needed):
-    # Convert Excel to PDF
-    pdf_path = excel_to_pdf(df)
-    # Extract text from PDF
-    pdf_text = pdf_to_text(pdf_path)
-    prompt = f"""Analyze the following Excel data from sheet '{sheet_name}':
-    {df.to_string()}
-    User's previous feedback and insights:
-    {user_feedback}
-    """
-    if category != "general":
-        prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
-        {', '.join(reports_needed)}
-        Please provide:
-        1. A comprehensive overview of the data focusing on the {category} category
-        2. Key observations and trends related to the required reports
-        3. Any anomalies, interesting patterns, or correlations relevant to the {category}
-        4. Suggestions for further analysis or visualization based on the required reports
-        5. Address any previous feedback or insights mentioned above, if applicable
-        Focus on providing a thorough analysis of all aspects of the data relevant to the {category} and the specified reports."""
-    else:
-        prompt += """Please provide a general analysis of the data, including:
-        1. A comprehensive overview of the data
-        2. Key observations and trends
-        3. Any anomalies, interesting patterns, or correlations
-        4. Suggestions for further analysis or visualization
-        5. Address any previous feedback or insights mentioned above, if applicable
-        Focus on providing a thorough analysis of all aspects of the data."""
-    response = client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[
-            {"role": "system", "content": f"You are a data analyst expert in interpreting Excel data for {'general' if category == 'general' else category} analysis."},
-            {"role": "user", "content": prompt}
-        ]
-    )
-    return response.choices[0].message.content
-def analyze_document_with_gpt(document_text, user_feedback, category, reports_needed):
-    prompt = f"""Analyze the following document content:
-    {document_text}
-    User's previous feedback and insights:
-    {user_feedback}
-    """
-    if category != "general":
-        prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
-        {', '.join(reports_needed)}
-        Please provide:
-        1. A comprehensive overview of the content focusing on the {category} category
-        2. Key points and main ideas related to the required reports
-        3. Any interesting patterns or unique aspects relevant to the {category}
-        4. Suggestions for further analysis or insights based on the required reports
-        5. Any limitations of the analysis due to the document format or OCR process
-        6. Address any previous feedback or insights mentioned above, if applicable
-        Focus on providing a thorough analysis of all aspects of the content relevant to the {category} and the specified reports."""
-    else:
-        prompt += """Please provide a general analysis of the document content, including:
-        1. A comprehensive overview of the content
-        2. Key points and main ideas
-        3. Any interesting patterns or unique aspects
-        4. Suggestions for further analysis or insights
-        5. Any limitations of the analysis due to the document format or OCR process
-        6. Address any previous feedback or insights mentioned above, if applicable
-        Focus on providing a thorough analysis of all aspects of the content."""
-    response = client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[
-            {"role": "system", "content": f"You are a data analyst expert in interpreting complex document content for {'general' if category == 'general' else category} analysis."},
-            {"role": "user", "content": prompt}
-        ]
-    )
-    return response.choices[0].message.content
 def process_uploaded_file(uploaded_file):
     file_type = uploaded_file.type
@@ -547,6 +578,67 @@ def process_challan_pdfs(pdf_files):
     df = pd.DataFrame(all_data)
     return df
 # Streamlit UI
 st.set_page_config(layout="wide")
 st.title("Document Processing, Chat, Excel Filling, and Analysis")
@@ -738,7 +830,6 @@ if st.session_state.user:
             if file_type is not None and content is not None:
                 if file_type == "excel":
                     dfs = content
-                    # Display a dropdown to select the sheet for analysis
                     sheet_names = list(dfs.keys())
                     selected_sheet = st.selectbox("Select a sheet for analysis", sheet_names)
@@ -746,14 +837,16 @@ if st.session_state.user:
                     st.write(f"Preview of {selected_sheet}:")
                     st.dataframe(df_to_analyze.head())
-                    # Store the DataFrame in session state
                     st.session_state.analyzed_data = df_to_analyze
                 elif file_type == "text":
                     st.write("Document content preview:")
                     preview_text = content[:500] + "..."
                     st.text(preview_text)  # Show first 500 characters
-                    # Store the text content in session state
                     st.session_state.analyzed_data = content
                 # Add category selection with "Default" option
@@ -765,32 +858,23 @@ if st.session_state.user:
                 if st.button("Analyze with GPT"):
                     with st.spinner("Analyzing data... This may take a while for large datasets."):
-                        # Get accumulated user feedback
                         user_feedback = get_user_feedback(st.session_state.user["_id"])
-                        # Get reports needed for the selected category (empty list for "Default")
                         reports_needed = get_category_reports().get(selected_category, [])
-                        # Modify the analysis prompt based on the selected category
-                        if selected_category == "Default":
-                            if file_type == "excel":
-                                analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, "general", [])
-                            else:  # PDF or Word document
-                                analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, "general", [])
-                        else:
-                            if file_type == "excel":
-                                analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, selected_category, reports_needed)
-                            else:  # PDF or Word document
                                 analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, selected_category, reports_needed)
                         st.markdown("## Analysis Results")
                         st.markdown(analysis_result)
-                        # Store the analysis result in session state
                         st.session_state.analysis_result = analysis_result
                         if file_type == "excel":
-                            # Provide download link for the Excel PDF
                             pdf_path = excel_to_pdf(st.session_state.analyzed_data)
                             with open(pdf_path, "rb") as pdf_file:
                                 pdf_bytes = pdf_file.read()
@@ -806,14 +890,11 @@ if st.session_state.user:
                 new_feedback = st.text_area("Provide feedback or additional insights about the analysis:")
                 if st.button("Submit Feedback"):
                     if new_feedback:
-                        # Get existing feedback
                         user = users_collection.find_one({"_id": st.session_state.user["_id"]})
                         existing_feedback = user.get("feedback", "")
-                        # Append new feedback to existing feedback
                         updated_feedback = f"{existing_feedback}\n{new_feedback}" if existing_feedback else new_feedback
-                        # Update the user's feedback in MongoDB
                         users_collection.update_one(
                             {"_id": st.session_state.user["_id"]},
                             {"$set": {"feedback": updated_feedback}}
@@ -856,7 +937,6 @@ if st.session_state.user:
                     st.write("Challan Data:")
                     st.dataframe(challan_df)
-                    # Provide download link for the Excel file
                     buffer = io.BytesIO()
                     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                         challan_df.to_excel(writer, index=False, sheet_name='Challan Data')

 from pymongo import MongoClient
 import traceback
 from docx import Document
+import pandas as pd
+import io
+import time
+import traceback
 load_dotenv()
         ]
     }
+def analyze_excel_with_gpt(df, sheet_name, user_feedback, category, reports_needed, use_assistants_api=False):
+    if use_assistants_api:
+        return process_excel_with_assistant(df, category, reports_needed, user_feedback)
+    else:
+        # Existing OCR-based analysis code
+        prompt = f"""Analyze the following Excel data from sheet '{sheet_name}':
+        {df.to_string()}
+        User's previous feedback and insights:
+        {user_feedback}
+        """
+        if category != "general":
+            prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
+            {', '.join(reports_needed)}
+            Please provide:
+            1. A comprehensive overview of the data focusing on the {category} category
+            2. Key observations and trends related to the required reports
+            3. Any anomalies, interesting patterns, or correlations relevant to the {category}
+            4. Suggestions for further analysis or visualization based on the required reports
+            5. Address any previous feedback or insights mentioned above, if applicable
+            Focus on providing a thorough analysis of all aspects of the data relevant to the {category} and the specified reports."""
+        else:
+            prompt += """Please provide a general analysis of the data, including:
+            1. A comprehensive overview of the data
+            2. Key observations and trends
+            3. Any anomalies, interesting patterns, or correlations
+            4. Suggestions for further analysis or visualization
+            5. Address any previous feedback or insights mentioned above, if applicable
+            Focus on providing a thorough analysis of all aspects of the data."""
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": f"You are a data analyst expert in interpreting Excel data for {'general' if category == 'general' else category} analysis."},
+                {"role": "user", "content": prompt}
+            ]
+        )
+        return response.choices[0].message.content
+def analyze_document_with_gpt(document_content, user_feedback, category, reports_needed, use_assistants_api=False, file_id=None):
+    if use_assistants_api:
+        assistant = client.beta.assistants.create(
+            name="Document Analyzer",
+            instructions=f"You are a document analysis expert. Analyze the uploaded document and provide insights based on the category: {category}.",
+            model="gpt-4-1106-preview"
+        )
+        thread = client.beta.threads.create()
+        message = client.beta.threads.messages.create(
+            thread_id=thread.id,
+            role="user",
+            content=f"Analyze the document with file ID: {file_id}. Category: {category}. Required reports: {', '.join(reports_needed)}. User feedback: {user_feedback}",
+            file_ids=[file_id]
+        )
+        run = client.beta.threads.runs.create(
+            thread_id=thread.id,
+            assistant_id=assistant.id
+        )
+        while run.status != "completed":
+            run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
+            time.sleep(1)
+        messages = client.beta.threads.messages.list(thread_id=thread.id)
+        return messages.data[0].content[0].text.value
+    else:
+        # Existing OCR-based analysis code
+        prompt = f"""Analyze the following document content:
+        {document_content}
+        User's previous feedback and insights:
+        {user_feedback}
+        """
+        if category != "general":
+            prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
+            {', '.join(reports_needed)}
+            Please provide:
+            1. A comprehensive overview of the content focusing on the {category} category
+            2. Key points and main ideas related to the required reports
+            3. Any interesting patterns or unique aspects relevant to the {category}
+            4. Suggestions for further analysis or insights based on the required reports
+            5. Any limitations of the analysis due to the document format or OCR process
+            6. Address any previous feedback or insights mentioned above, if applicable
+            Focus on providing a thorough analysis of all aspects of the content relevant to the {category} and the specified reports."""
+        else:
+            prompt += """Please provide a general analysis of the document content, including:
+            1. A comprehensive overview of the content
+            2. Key points and main ideas
+            3. Any interesting patterns or unique aspects
+            4. Suggestions for further analysis or insights
+            5. Any limitations of the analysis due to the document format or OCR process
+            6. Address any previous feedback or insights mentioned above, if applicable
+            Focus on providing a thorough analysis of all aspects of the content."""
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "system", "content": f"You are a data analyst expert in interpreting complex document content for {'general' if category == 'general' else category} analysis."},
+                {"role": "user", "content": prompt}
+            ]
+        )
+        return response.choices[0].message.content
 def process_uploaded_file(uploaded_file):
     file_type = uploaded_file.type
     df = pd.DataFrame(all_data)
     return df
+def process_file_with_assistant(file, file_type, category, reports_needed, user_feedback):
+    print(f"Starting {file_type} processing with Assistant")
+    try:
+        # Upload the file to OpenAI
+        uploaded_file = client.files.create(
+            file=file,
+            purpose='assistants'
+        )
+        print(f"File uploaded successfully. File ID: {uploaded_file.id}")
+        # Create an assistant
+        assistant = client.beta.assistants.create(
+            name=f"{file_type} Analyzer",
+            instructions=f"You are an expert in analyzing {file_type} files, focusing on {category}. Provide insights and summaries of the content based on the following reports: {', '.join(reports_needed)}. Consider the user's previous feedback: {user_feedback}",
+            model="gpt-4o",
+            tools=[{"type": "file_search"}]
+        )
+        print(f"Assistant created. Assistant ID: {assistant.id}")
+        # Create a thread
+        thread = client.beta.threads.create()
+        print(f"Thread created. Thread ID: {thread.id}")
+        # Add a message to the thread with the file attachment
+        message = client.beta.threads.messages.create(
+            thread_id=thread.id,
+            role="user",
+            content=f"Please analyze this  file and provide insights for the {category} category, focusing on the following reports: {', '.join(reports_needed)}.",
+            attachments=[
+                {"file_id": uploaded_file.id, "tools": [{"type": "file_search"}]}
+            ]
+        )
+        print(f"Message added to thread. Message ID: {message.id}")
+        # Run the assistant
+        run = client.beta.threads.runs.create(
+            thread_id=thread.id,
+            assistant_id=assistant.id
+        )
+        print(f"Run created. Run ID: {run.id}")
+        # Wait for the run to complete
+        while run.status != 'completed':
+            run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
+            print(f"Run status: {run.status}")
+            time.sleep(1)
+        # Retrieve the messages
+        messages = client.beta.threads.messages.list(thread_id=thread.id)
+        # Extract the assistant's response
+        analysis_result = next((msg.content[0].text.value for msg in messages if msg.role == 'assistant'), None)
+        print(f"{file_type} analysis completed successfully")
+        return analysis_result
+    except Exception as e:
+        print(f"Error in process_file_with_assistant: {str(e)}")
+        print(traceback.format_exc())
+        return None
 # Streamlit UI
 st.set_page_config(layout="wide")
 st.title("Document Processing, Chat, Excel Filling, and Analysis")
             if file_type is not None and content is not None:
                 if file_type == "excel":
                     dfs = content
                     sheet_names = list(dfs.keys())
                     selected_sheet = st.selectbox("Select a sheet for analysis", sheet_names)
                     st.write(f"Preview of {selected_sheet}:")
                     st.dataframe(df_to_analyze.head())
                     st.session_state.analyzed_data = df_to_analyze
+                    analysis_method = "OCR"  # Default to OCR for Excel files
                 elif file_type == "text":
                     st.write("Document content preview:")
                     preview_text = content[:500] + "..."
                     st.text(preview_text)  # Show first 500 characters
+                    # Add option to choose between OCR and Assistants API for PDF/Word
+                    analysis_method = st.radio("Choose analysis method:", ("OCR", "OpenAI Assistants API"))
                     st.session_state.analyzed_data = content
                 # Add category selection with "Default" option
                 if st.button("Analyze with GPT"):
                     with st.spinner("Analyzing data... This may take a while for large datasets."):
                         user_feedback = get_user_feedback(st.session_state.user["_id"])
                         reports_needed = get_category_reports().get(selected_category, [])
+                        if file_type == "excel":
+                            analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, selected_category, reports_needed)
+                        else:  # PDF or Word document
+                            if analysis_method == "OpenAI Assistants API":
+                                analysis_result = process_file_with_assistant(uploaded_file, "PDF", selected_category, reports_needed, user_feedback)
+                            else:
                                 analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, selected_category, reports_needed)
                         st.markdown("## Analysis Results")
                         st.markdown(analysis_result)
                         st.session_state.analysis_result = analysis_result
                         if file_type == "excel":
                             pdf_path = excel_to_pdf(st.session_state.analyzed_data)
                             with open(pdf_path, "rb") as pdf_file:
                                 pdf_bytes = pdf_file.read()
                 new_feedback = st.text_area("Provide feedback or additional insights about the analysis:")
                 if st.button("Submit Feedback"):
                     if new_feedback:
                         user = users_collection.find_one({"_id": st.session_state.user["_id"]})
                         existing_feedback = user.get("feedback", "")
                         updated_feedback = f"{existing_feedback}\n{new_feedback}" if existing_feedback else new_feedback
                         users_collection.update_one(
                             {"_id": st.session_state.user["_id"]},
                             {"$set": {"feedback": updated_feedback}}
                     st.write("Challan Data:")
                     st.dataframe(challan_df)
                     buffer = io.BytesIO()
                     with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                         challan_df.to_excel(writer, index=False, sheet_name='Challan Data')