poemsforaphrodite committed on
Commit
63e34b3
·
verified ·
1 Parent(s): 212732e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -97
app.py CHANGED
@@ -28,6 +28,10 @@ import numpy as np
28
  from pymongo import MongoClient
29
  import traceback
30
  from docx import Document
 
 
 
 
31
 
32
  load_dotenv()
33
 
@@ -361,97 +365,124 @@ def get_category_reports():
361
  ]
362
  }
363
 
364
- def analyze_excel_with_gpt(df, sheet_name, user_feedback, category, reports_needed):
365
- # Convert Excel to PDF
366
- pdf_path = excel_to_pdf(df)
367
-
368
- # Extract text from PDF
369
- pdf_text = pdf_to_text(pdf_path)
370
-
371
- prompt = f"""Analyze the following Excel data from sheet '{sheet_name}':
372
-
373
- {df.to_string()}
374
 
375
- User's previous feedback and insights:
376
- {user_feedback}
377
 
378
- """
 
379
 
380
- if category != "general":
381
- prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
382
- {', '.join(reports_needed)}
383
 
384
- Please provide:
385
- 1. A comprehensive overview of the data focusing on the {category} category
386
- 2. Key observations and trends related to the required reports
387
- 3. Any anomalies, interesting patterns, or correlations relevant to the {category}
388
- 4. Suggestions for further analysis or visualization based on the required reports
389
- 5. Address any previous feedback or insights mentioned above, if applicable
390
 
391
- Focus on providing a thorough analysis of all aspects of the data relevant to the {category} and the specified reports."""
392
- else:
393
- prompt += """Please provide a general analysis of the data, including:
394
- 1. A comprehensive overview of the data
395
- 2. Key observations and trends
396
- 3. Any anomalies, interesting patterns, or correlations
397
- 4. Suggestions for further analysis or visualization
398
- 5. Address any previous feedback or insights mentioned above, if applicable
399
 
400
- Focus on providing a thorough analysis of all aspects of the data."""
 
 
 
 
 
 
 
401
 
402
- response = client.chat.completions.create(
403
- model="gpt-4o-mini",
404
- messages=[
405
- {"role": "system", "content": f"You are a data analyst expert in interpreting Excel data for {'general' if category == 'general' else category} analysis."},
406
- {"role": "user", "content": prompt}
407
- ]
408
- )
409
-
410
- return response.choices[0].message.content
411
 
412
- def analyze_document_with_gpt(document_text, user_feedback, category, reports_needed):
413
- prompt = f"""Analyze the following document content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
- {document_text}
416
 
417
- User's previous feedback and insights:
418
- {user_feedback}
419
 
420
- """
421
 
422
- if category != "general":
423
- prompt += f"""Please provide analysis and insights based on the following required reports for the category '{category}':
424
- {', '.join(reports_needed)}
425
 
426
- Please provide:
427
- 1. A comprehensive overview of the content focusing on the {category} category
428
- 2. Key points and main ideas related to the required reports
429
- 3. Any interesting patterns or unique aspects relevant to the {category}
430
- 4. Suggestions for further analysis or insights based on the required reports
431
- 5. Any limitations of the analysis due to the document format or OCR process
432
- 6. Address any previous feedback or insights mentioned above, if applicable
433
 
434
- Focus on providing a thorough analysis of all aspects of the content relevant to the {category} and the specified reports."""
435
- else:
436
- prompt += """Please provide a general analysis of the document content, including:
437
- 1. A comprehensive overview of the content
438
- 2. Key points and main ideas
439
- 3. Any interesting patterns or unique aspects
440
- 4. Suggestions for further analysis or insights
441
- 5. Any limitations of the analysis due to the document format or OCR process
442
- 6. Address any previous feedback or insights mentioned above, if applicable
443
 
444
- Focus on providing a thorough analysis of all aspects of the content."""
445
 
446
- response = client.chat.completions.create(
447
- model="gpt-4o-mini",
448
- messages=[
449
- {"role": "system", "content": f"You are a data analyst expert in interpreting complex document content for {'general' if category == 'general' else category} analysis."},
450
- {"role": "user", "content": prompt}
451
- ]
452
- )
453
-
454
- return response.choices[0].message.content
455
 
456
  def process_uploaded_file(uploaded_file):
457
  file_type = uploaded_file.type
@@ -547,6 +578,67 @@ def process_challan_pdfs(pdf_files):
547
  df = pd.DataFrame(all_data)
548
  return df
549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  # Streamlit UI
551
  st.set_page_config(layout="wide")
552
  st.title("Document Processing, Chat, Excel Filling, and Analysis")
@@ -738,7 +830,6 @@ if st.session_state.user:
738
  if file_type is not None and content is not None:
739
  if file_type == "excel":
740
  dfs = content
741
- # Display a dropdown to select the sheet for analysis
742
  sheet_names = list(dfs.keys())
743
  selected_sheet = st.selectbox("Select a sheet for analysis", sheet_names)
744
 
@@ -746,14 +837,16 @@ if st.session_state.user:
746
  st.write(f"Preview of {selected_sheet}:")
747
  st.dataframe(df_to_analyze.head())
748
 
749
- # Store the DataFrame in session state
750
  st.session_state.analyzed_data = df_to_analyze
 
751
  elif file_type == "text":
752
  st.write("Document content preview:")
753
  preview_text = content[:500] + "..."
754
  st.text(preview_text) # Show first 500 characters
755
 
756
- # Store the text content in session state
 
 
757
  st.session_state.analyzed_data = content
758
 
759
  # Add category selection with "Default" option
@@ -765,32 +858,23 @@ if st.session_state.user:
765
 
766
  if st.button("Analyze with GPT"):
767
  with st.spinner("Analyzing data... This may take a while for large datasets."):
768
- # Get accumulated user feedback
769
  user_feedback = get_user_feedback(st.session_state.user["_id"])
770
-
771
- # Get reports needed for the selected category (empty list for "Default")
772
  reports_needed = get_category_reports().get(selected_category, [])
773
 
774
- # Modify the analysis prompt based on the selected category
775
- if selected_category == "Default":
776
- if file_type == "excel":
777
- analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, "general", [])
778
- else: # PDF or Word document
779
- analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, "general", [])
780
- else:
781
- if file_type == "excel":
782
- analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, selected_category, reports_needed)
783
- else: # PDF or Word document
784
  analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, selected_category, reports_needed)
785
 
786
  st.markdown("## Analysis Results")
787
  st.markdown(analysis_result)
788
 
789
- # Store the analysis result in session state
790
  st.session_state.analysis_result = analysis_result
791
 
792
  if file_type == "excel":
793
- # Provide download link for the Excel PDF
794
  pdf_path = excel_to_pdf(st.session_state.analyzed_data)
795
  with open(pdf_path, "rb") as pdf_file:
796
  pdf_bytes = pdf_file.read()
@@ -806,14 +890,11 @@ if st.session_state.user:
806
  new_feedback = st.text_area("Provide feedback or additional insights about the analysis:")
807
  if st.button("Submit Feedback"):
808
  if new_feedback:
809
- # Get existing feedback
810
  user = users_collection.find_one({"_id": st.session_state.user["_id"]})
811
  existing_feedback = user.get("feedback", "")
812
 
813
- # Append new feedback to existing feedback
814
  updated_feedback = f"{existing_feedback}\n{new_feedback}" if existing_feedback else new_feedback
815
 
816
- # Update the user's feedback in MongoDB
817
  users_collection.update_one(
818
  {"_id": st.session_state.user["_id"]},
819
  {"$set": {"feedback": updated_feedback}}
@@ -856,7 +937,6 @@ if st.session_state.user:
856
  st.write("Challan Data:")
857
  st.dataframe(challan_df)
858
 
859
- # Provide download link for the Excel file
860
  buffer = io.BytesIO()
861
  with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
862
  challan_df.to_excel(writer, index=False, sheet_name='Challan Data')
 
28
  from pymongo import MongoClient
29
  import traceback
30
  from docx import Document
31
+ import pandas as pd
32
+ import io
33
+ import time
34
+ import traceback
35
 
36
  load_dotenv()
37
 
 
365
  ]
366
  }
367
 
368
def analyze_excel_with_gpt(df, sheet_name, user_feedback, category, reports_needed, use_assistants_api=False):
    """Analyse one Excel sheet with the chat model and return its written report.

    Args:
        df: Sheet contents; rendered into the prompt with ``df.to_string()``.
        sheet_name: Name of the sheet, quoted in the prompt.
        user_feedback: Earlier feedback text, embedded in the prompt verbatim.
        category: Analysis category. ``"general"`` and the UI's ``"Default"``
            choice both select the general-purpose prompt.
        reports_needed: Report names required for the category; ``None`` is
            treated as an empty list.
        use_assistants_api: When true, delegate to
            ``process_excel_with_assistant`` instead of building a prompt.

    Returns:
        The text content of the model's first choice.
    """
    if use_assistants_api:
        # NOTE(review): process_excel_with_assistant is not defined in the
        # visible part of this file -- confirm it exists before enabling this flag.
        return process_excel_with_assistant(df, category, reports_needed, user_feedback)

    # The UI passes its "Default" selection straight through; without this
    # mapping it would be handled as a named category with no reports.
    if category == "Default":
        category = "general"
    reports_needed = reports_needed or []

    prompt = (
        f"Analyze the following Excel data from sheet '{sheet_name}':\n"
        "\n"
        f"{df.to_string()}\n"
        "\n"
        "User's previous feedback and insights:\n"
        f"{user_feedback}\n"
        "\n"
    )

    if category != "general":
        prompt += (
            "Please provide analysis and insights based on the following "
            f"required reports for the category '{category}':\n"
            f"{', '.join(reports_needed)}\n"
            "\n"
            "Please provide:\n"
            f"1. A comprehensive overview of the data focusing on the {category} category\n"
            "2. Key observations and trends related to the required reports\n"
            f"3. Any anomalies, interesting patterns, or correlations relevant to the {category}\n"
            "4. Suggestions for further analysis or visualization based on the required reports\n"
            "5. Address any previous feedback or insights mentioned above, if applicable\n"
            "\n"
            "Focus on providing a thorough analysis of all aspects of the data "
            f"relevant to the {category} and the specified reports."
        )
    else:
        prompt += (
            "Please provide a general analysis of the data, including:\n"
            "1. A comprehensive overview of the data\n"
            "2. Key observations and trends\n"
            "3. Any anomalies, interesting patterns, or correlations\n"
            "4. Suggestions for further analysis or visualization\n"
            "5. Address any previous feedback or insights mentioned above, if applicable\n"
            "\n"
            "Focus on providing a thorough analysis of all aspects of the data."
        )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"You are a data analyst expert in interpreting Excel data for {category} analysis.",
            },
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content
413
+
414
def analyze_document_with_gpt(document_content, user_feedback, category, reports_needed, use_assistants_api=False, file_id=None):
    """Analyse document content with OpenAI and return the written analysis.

    Two paths are supported:

    * ``use_assistants_api=False`` (default): ``document_content`` is embedded
      in a chat-completions prompt.
    * ``use_assistants_api=True``: a file already uploaded to OpenAI
      (``file_id``) is attached to a new thread and analysed by a temporary
      assistant.

    Args:
        document_content: Document text embedded in the prompt (chat path only).
        user_feedback: Earlier feedback text, passed to the model verbatim.
        category: Analysis category. ``"general"`` and the UI's ``"Default"``
            choice both select the general-purpose prompt.
        reports_needed: Report names required for the category; ``None`` is
            treated as an empty list.
        use_assistants_api: Selects the Assistants path when true.
        file_id: ID of the uploaded file; required on the Assistants path.

    Returns:
        The model's analysis text. On the Assistants path this is ``None``
        when the thread holds no assistant message.

    Raises:
        ValueError: Assistants path requested without a ``file_id``.
        RuntimeError: The assistant run ended in a state other than
            ``completed``.
    """
    reports_needed = reports_needed or []

    if use_assistants_api:
        if file_id is None:
            raise ValueError("file_id is required when use_assistants_api=True")

        assistant = client.beta.assistants.create(
            name="Document Analyzer",
            instructions=f"You are a document analysis expert. Analyze the uploaded document and provide insights based on the category: {category}.",
            model="gpt-4-1106-preview",
            # file_search lets the assistant read the attachment; this mirrors
            # process_file_with_assistant elsewhere in this file.
            tools=[{"type": "file_search"}],
        )
        try:
            thread = client.beta.threads.create()

            client.beta.threads.messages.create(
                thread_id=thread.id,
                role="user",
                content=f"Analyze the document with file ID: {file_id}. Category: {category}. Required reports: {', '.join(reports_needed)}. User feedback: {user_feedback}",
                # Same attachment form as process_file_with_assistant.
                attachments=[
                    {"file_id": file_id, "tools": [{"type": "file_search"}]}
                ],
            )

            run = client.beta.threads.runs.create(
                thread_id=thread.id,
                assistant_id=assistant.id,
            )

            # Poll until the run completes, but stop on any other terminal
            # state instead of spinning forever.
            while run.status != "completed":
                if run.status in ("failed", "cancelled", "expired", "incomplete"):
                    raise RuntimeError(f"Assistant run ended with status '{run.status}'")
                time.sleep(1)
                run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

            messages = client.beta.threads.messages.list(thread_id=thread.id)
            # Pick the assistant's reply rather than assuming list position.
            return next(
                (msg.content[0].text.value for msg in messages.data if msg.role == "assistant"),
                None,
            )
        finally:
            # The assistant exists only for this call; delete it so they do
            # not pile up in the account. Best effort: never mask the result.
            try:
                client.beta.assistants.delete(assistant.id)
            except Exception as cleanup_error:
                print(f"Could not delete assistant {assistant.id}: {cleanup_error}")

    # The UI passes its "Default" selection straight through; without this
    # mapping it would be handled as a named category with no reports.
    if category == "Default":
        category = "general"

    prompt = (
        "Analyze the following document content:\n"
        "\n"
        f"{document_content}\n"
        "\n"
        "User's previous feedback and insights:\n"
        f"{user_feedback}\n"
        "\n"
    )

    if category != "general":
        prompt += (
            "Please provide analysis and insights based on the following "
            f"required reports for the category '{category}':\n"
            f"{', '.join(reports_needed)}\n"
            "\n"
            "Please provide:\n"
            f"1. A comprehensive overview of the content focusing on the {category} category\n"
            "2. Key points and main ideas related to the required reports\n"
            f"3. Any interesting patterns or unique aspects relevant to the {category}\n"
            "4. Suggestions for further analysis or insights based on the required reports\n"
            "5. Any limitations of the analysis due to the document format or OCR process\n"
            "6. Address any previous feedback or insights mentioned above, if applicable\n"
            "\n"
            "Focus on providing a thorough analysis of all aspects of the content "
            f"relevant to the {category} and the specified reports."
        )
    else:
        prompt += (
            "Please provide a general analysis of the document content, including:\n"
            "1. A comprehensive overview of the content\n"
            "2. Key points and main ideas\n"
            "3. Any interesting patterns or unique aspects\n"
            "4. Suggestions for further analysis or insights\n"
            "5. Any limitations of the analysis due to the document format or OCR process\n"
            "6. Address any previous feedback or insights mentioned above, if applicable\n"
            "\n"
            "Focus on providing a thorough analysis of all aspects of the content."
        )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"You are a data analyst expert in interpreting complex document content for {category} analysis.",
            },
            {"role": "user", "content": prompt},
        ],
    )

    return response.choices[0].message.content
486
 
487
  def process_uploaded_file(uploaded_file):
488
  file_type = uploaded_file.type
 
578
  df = pd.DataFrame(all_data)
579
  return df
580
 
581
+ def process_file_with_assistant(file, file_type, category, reports_needed, user_feedback):
582
+ print(f"Starting {file_type} processing with Assistant")
583
+ try:
584
+ # Upload the file to OpenAI
585
+ uploaded_file = client.files.create(
586
+ file=file,
587
+ purpose='assistants'
588
+ )
589
+ print(f"File uploaded successfully. File ID: {uploaded_file.id}")
590
+
591
+ # Create an assistant
592
+ assistant = client.beta.assistants.create(
593
+ name=f"{file_type} Analyzer",
594
+ instructions=f"You are an expert in analyzing {file_type} files, focusing on {category}. Provide insights and summaries of the content based on the following reports: {', '.join(reports_needed)}. Consider the user's previous feedback: {user_feedback}",
595
+ model="gpt-4o",
596
+ tools=[{"type": "file_search"}]
597
+ )
598
+ print(f"Assistant created. Assistant ID: {assistant.id}")
599
+
600
+ # Create a thread
601
+ thread = client.beta.threads.create()
602
+ print(f"Thread created. Thread ID: {thread.id}")
603
+
604
+ # Add a message to the thread with the file attachment
605
+ message = client.beta.threads.messages.create(
606
+ thread_id=thread.id,
607
+ role="user",
608
+ content=f"Please analyze this file and provide insights for the {category} category, focusing on the following reports: {', '.join(reports_needed)}.",
609
+ attachments=[
610
+ {"file_id": uploaded_file.id, "tools": [{"type": "file_search"}]}
611
+ ]
612
+ )
613
+ print(f"Message added to thread. Message ID: {message.id}")
614
+
615
+ # Run the assistant
616
+ run = client.beta.threads.runs.create(
617
+ thread_id=thread.id,
618
+ assistant_id=assistant.id
619
+ )
620
+ print(f"Run created. Run ID: {run.id}")
621
+
622
+ # Wait for the run to complete
623
+ while run.status != 'completed':
624
+ run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
625
+ print(f"Run status: {run.status}")
626
+ time.sleep(1)
627
+
628
+ # Retrieve the messages
629
+ messages = client.beta.threads.messages.list(thread_id=thread.id)
630
+
631
+ # Extract the assistant's response
632
+ analysis_result = next((msg.content[0].text.value for msg in messages if msg.role == 'assistant'), None)
633
+
634
+ print(f"{file_type} analysis completed successfully")
635
+ return analysis_result
636
+
637
+ except Exception as e:
638
+ print(f"Error in process_file_with_assistant: {str(e)}")
639
+ print(traceback.format_exc())
640
+ return None
641
+
642
  # Streamlit UI
643
  st.set_page_config(layout="wide")
644
  st.title("Document Processing, Chat, Excel Filling, and Analysis")
 
830
  if file_type is not None and content is not None:
831
  if file_type == "excel":
832
  dfs = content
 
833
  sheet_names = list(dfs.keys())
834
  selected_sheet = st.selectbox("Select a sheet for analysis", sheet_names)
835
 
 
837
  st.write(f"Preview of {selected_sheet}:")
838
  st.dataframe(df_to_analyze.head())
839
 
 
840
  st.session_state.analyzed_data = df_to_analyze
841
+ analysis_method = "OCR" # Default to OCR for Excel files
842
  elif file_type == "text":
843
  st.write("Document content preview:")
844
  preview_text = content[:500] + "..."
845
  st.text(preview_text) # Show first 500 characters
846
 
847
+ # Add option to choose between OCR and Assistants API for PDF/Word
848
+ analysis_method = st.radio("Choose analysis method:", ("OCR", "OpenAI Assistants API"))
849
+
850
  st.session_state.analyzed_data = content
851
 
852
  # Add category selection with "Default" option
 
858
 
859
  if st.button("Analyze with GPT"):
860
  with st.spinner("Analyzing data... This may take a while for large datasets."):
 
861
  user_feedback = get_user_feedback(st.session_state.user["_id"])
 
 
862
  reports_needed = get_category_reports().get(selected_category, [])
863
 
864
+ if file_type == "excel":
865
+ analysis_result = analyze_excel_with_gpt(st.session_state.analyzed_data, selected_sheet, user_feedback, selected_category, reports_needed)
866
+ else: # PDF or Word document
867
+ if analysis_method == "OpenAI Assistants API":
868
+ analysis_result = process_file_with_assistant(uploaded_file, "PDF", selected_category, reports_needed, user_feedback)
869
+ else:
 
 
 
 
870
  analysis_result = analyze_document_with_gpt(st.session_state.analyzed_data, user_feedback, selected_category, reports_needed)
871
 
872
  st.markdown("## Analysis Results")
873
  st.markdown(analysis_result)
874
 
 
875
  st.session_state.analysis_result = analysis_result
876
 
877
  if file_type == "excel":
 
878
  pdf_path = excel_to_pdf(st.session_state.analyzed_data)
879
  with open(pdf_path, "rb") as pdf_file:
880
  pdf_bytes = pdf_file.read()
 
890
  new_feedback = st.text_area("Provide feedback or additional insights about the analysis:")
891
  if st.button("Submit Feedback"):
892
  if new_feedback:
 
893
  user = users_collection.find_one({"_id": st.session_state.user["_id"]})
894
  existing_feedback = user.get("feedback", "")
895
 
 
896
  updated_feedback = f"{existing_feedback}\n{new_feedback}" if existing_feedback else new_feedback
897
 
 
898
  users_collection.update_one(
899
  {"_id": st.session_state.user["_id"]},
900
  {"$set": {"feedback": updated_feedback}}
 
937
  st.write("Challan Data:")
938
  st.dataframe(challan_df)
939
 
 
940
  buffer = io.BytesIO()
941
  with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
942
  challan_df.to_excel(writer, index=False, sheet_name='Challan Data')