Ronio Jerico Roque committed on
Commit
c650b65
·
1 Parent(s): d3fe99b

Add WebsiteAudienceAcquisition class and integrate into analysis workflow; refactor upload handling in uploadFile

Browse files
classes/Off_Page.py CHANGED
@@ -3,14 +3,12 @@ from urllib.parse import urlparse
3
  import streamlit as st
4
  import requests
5
  from dotenv import load_dotenv
6
- import os
7
  import time
8
  from helper.telemetry import collect_telemetry
9
  from helper.upload_File import uploadFile
10
  from helper.button_behaviour import hide_button, unhide_button
11
  from helper.initialize_analyze_session import initialize_analyze_session
12
  import pandas as pd
13
- import asyncio
14
  import json
15
 
16
  class SeoOffPageAnalyst:
@@ -32,6 +30,10 @@ class SeoOffPageAnalyst:
32
  #st.header(self.analyst_name)
33
  if 'off_page_file_uploaded' not in st.session_state:
34
  st.session_state['off_page_file_uploaded'] = ''
 
 
 
 
35
 
36
  def request_model(self, payload_txt, headers):
37
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
@@ -48,32 +50,64 @@ class SeoOffPageAnalyst:
48
  def process(self):
49
  start_time = time.time()
50
  session = st.session_state['analyze']
51
- if self.uploaded_files and session == 'clicked':
52
  combined_text = ""
 
53
  with st.spinner('SEO Off Page Analyst...', show_time=True):
54
  st.write('')
55
- for file_info in st.session_state['uploaded_files'].values():
56
- '''
57
- if file_info['type'] == 'pdf':
58
- combined_text += file_info['content'] + "\n"
59
- '''
60
  try:
61
- if file_info['type'] == 'csv':
62
- # Load CSV
63
- df = pd.read_csv(StringIO(file_info['content'].to_csv(index=True)))
64
-
65
- # Count total rows
66
- num_rows = len(df)
67
-
68
- # Extract unique domains from 'Source url'
69
- df['Source Domain'] = df['Source url'].apply(lambda x: urlparse(x).netloc)
70
- unique_domains = df['Source Domain'].nunique()
71
 
72
- combined_text += f"Total Backlinks Count: {num_rows}\n"
73
- combined_text += f"Referring Domain: {unique_domains}"
74
- st.info("Backlinks - SEMRush Uploaded Successfuly", icon="ℹ️")
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  except KeyError:
76
- st.info("Incorrect CSV format. Please upload a valid CSV file.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  # OUTPUT FOR SEO ANALYST
78
 
79
  #result = self.request_model(payload_txt, headers)
@@ -82,19 +116,31 @@ class SeoOffPageAnalyst:
82
  #time_lapsed = end_time - start_time
83
 
84
  debug_info = {'data_field' : 'Backlinks', 'result': combined_text}
 
85
  #debug_info = {'url_uuid': self.model_url.split("-")[-1],'time_lapsed' : time_lapsed, 'files': [*st.session_state['uploaded_files']],'payload': payload_txt, 'result': result}
86
- collect_telemetry(debug_info)
87
 
88
- st.session_state["off_page_file_uploaded"] = 'uploaded'
 
 
 
 
 
 
89
 
90
  #with st.expander("Debug information", icon="⚙"):
91
  # st.write(debug_info)
92
  st.session_state['analyzing'] = False
 
93
  def row1(self):
94
  #st.write(self.data_src)
95
  self.uploaded_files = st.file_uploader('Backlinks - SEMRush', type='csv', accept_multiple_files=True, key="seo_off")
 
 
 
96
  if self.uploaded_files:
97
  upload.multiple_upload_file(self.uploaded_files)
 
 
98
 
99
  #st.write("") # FOR THE HIDE BUTTON
100
  #st.write("") # FOR THE HIDE BUTTON
 
3
  import streamlit as st
4
  import requests
5
  from dotenv import load_dotenv
 
6
  import time
7
  from helper.telemetry import collect_telemetry
8
  from helper.upload_File import uploadFile
9
  from helper.button_behaviour import hide_button, unhide_button
10
  from helper.initialize_analyze_session import initialize_analyze_session
11
  import pandas as pd
 
12
  import json
13
 
14
  class SeoOffPageAnalyst:
 
30
  #st.header(self.analyst_name)
31
  if 'off_page_file_uploaded' not in st.session_state:
32
  st.session_state['off_page_file_uploaded'] = ''
33
+ if 'website_audience' not in st.session_state:
34
+ st.session_state['website_audience'] = ''
35
+ if 'uploaded_files' not in st.session_state:
36
+ st.session_state['uploaded_files'] = ''
37
 
38
  def request_model(self, payload_txt, headers):
39
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
 
50
  def process(self):
51
  start_time = time.time()
52
  session = st.session_state['analyze']
53
+ if (self.uploaded_files or self.website_audience) and session == 'clicked':
54
  combined_text = ""
55
+ website_audience = ""
56
  with st.spinner('SEO Off Page Analyst...', show_time=True):
57
  st.write('')
 
 
 
 
 
58
  try:
59
+ for file_info in st.session_state['uploaded_files'].values():
60
+ '''
61
+ if file_info['type'] == 'pdf':
62
+ combined_text += file_info['content'] + "\n"
63
+ '''
64
+ try:
65
+ if file_info['type'] == 'csv':
66
+ # Load CSV
67
+ df = pd.read_csv(StringIO(file_info['content'].to_csv(index=True)))
 
68
 
69
+ # Count total rows
70
+ num_rows = len(df)
71
+
72
+ # Extract unique domains from 'Source url'
73
+ df['Source Domain'] = df['Source url'].apply(lambda x: urlparse(x).netloc)
74
+ unique_domains = df['Source Domain'].nunique()
75
+
76
+ combined_text += f"Total Backlinks Count: {num_rows}\n"
77
+ combined_text += f"Referring Domain: {unique_domains}"
78
+ st.info("Backlinks - SEMRush Uploaded Successfuly", icon="ℹ️")
79
+ except KeyError:
80
+ st.info("Incorrect CSV format. Please upload a valid CSV file.")
81
+ except UnboundLocalError:
82
+ pass
83
+ except AttributeError:
84
+ pass
85
  except KeyError:
86
+ pass
87
+
88
+ try:
89
+ # Check if upload_website_audience exists in session state and is a dictionary
90
+ if 'upload_website_audience' in st.session_state and isinstance(st.session_state['upload_website_audience'], dict):
91
+ for file_name, file_info in st.session_state['upload_website_audience'].items():
92
+ try:
93
+ if file_info['type'] == 'csv':
94
+ # Since file_info['content'] is already a DataFrame (from your earlier code)
95
+ # No need to convert back from string to DataFrame
96
+ df = file_info['content']
97
+
98
+ # Process your DataFrame here
99
+ # Instead of reading from StringIO, just use the DataFrame directly
100
+ website_audience += f"Website Audience Acquisition {df}\n"
101
+
102
+ st.info("Website Audience Acquisition Uploaded Successfully", icon="ℹ️")
103
+ except KeyError:
104
+ st.info(f"Incorrect format for {file_name}. Please upload a valid CSV file.")
105
+ else:
106
+ st.info("No website audience data available. Please upload CSV files first.")
107
+ except Exception as e:
108
+ st.error(f"Error processing data: {str(e)}")
109
+
110
+
111
  # OUTPUT FOR SEO ANALYST
112
 
113
  #result = self.request_model(payload_txt, headers)
 
116
  #time_lapsed = end_time - start_time
117
 
118
  debug_info = {'data_field' : 'Backlinks', 'result': combined_text}
119
+ debug_info_website_audience = {'data_field' : 'Website Audience Acquisition', 'result': website_audience}
120
  #debug_info = {'url_uuid': self.model_url.split("-")[-1],'time_lapsed' : time_lapsed, 'files': [*st.session_state['uploaded_files']],'payload': payload_txt, 'result': result}
 
121
 
122
+ if self.uploaded_files:
123
+ st.session_state['off_page_file_uploaded'] = 'uploaded'
124
+ collect_telemetry(debug_info)
125
+ if self.website_audience:
126
+ st.session_state['website_audience'] = 'uploaded'
127
+ collect_telemetry(debug_info_website_audience)
128
+
129
 
130
  #with st.expander("Debug information", icon="⚙"):
131
  # st.write(debug_info)
132
  st.session_state['analyzing'] = False
133
+
134
  def row1(self):
135
  #st.write(self.data_src)
136
  self.uploaded_files = st.file_uploader('Backlinks - SEMRush', type='csv', accept_multiple_files=True, key="seo_off")
137
+ self.website_audience = st.file_uploader('Website Audience Acquisition - GA4', type='csv', accept_multiple_files=True, key="website_audiences")
138
+ #self.website_audience = st.text_input("Website Audience Acquisition:", placeholder='Enter Website Audience Acquisition')
139
+
140
  if self.uploaded_files:
141
  upload.multiple_upload_file(self.uploaded_files)
142
+ if self.website_audience:
143
+ upload.upload_website_audience(self.website_audience)
144
 
145
  #st.write("") # FOR THE HIDE BUTTON
146
  #st.write("") # FOR THE HIDE BUTTON
classes/response_conversion_analyst.py CHANGED
@@ -28,7 +28,6 @@ class ConversionAnalyst:
28
  def request_model(self, payload_txt, headers):
29
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
30
  response.raise_for_status()
31
- print(response)
32
  output = response.json()
33
  #st.write(output)
34
  text = output["outputs"][0]["outputs"][0]["results"]["text"]["data"]["text"]
 
28
  def request_model(self, payload_txt, headers):
29
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
30
  response.raise_for_status()
 
31
  output = response.json()
32
  #st.write(output)
33
  text = output["outputs"][0]["outputs"][0]["results"]["text"]["data"]["text"]
classes/response_desired_outcome.py CHANGED
@@ -28,7 +28,6 @@ class DesiredOutcome:
28
  def request_model(self, payload_txt, headers):
29
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
30
  response.raise_for_status()
31
- print(response)
32
  output = response.json()
33
  #st.write(output)
34
  text = output["outputs"][0]["outputs"][0]["results"]["text"]["data"]["text"]
 
28
  def request_model(self, payload_txt, headers):
29
  response = requests.post(self.model_url, json=payload_txt, headers=headers)
30
  response.raise_for_status()
 
31
  output = response.json()
32
  #st.write(output)
33
  text = output["outputs"][0]["outputs"][0]["results"]["text"]["data"]["text"]
classes/response_website_audience_acquisition.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from dotenv import load_dotenv
4
+ import os
5
+ from helper.upload_response import upload_response
6
+ from helper.upload_File import uploadFile
7
+ from pymongo import MongoClient
8
+ import json
9
+
10
+ class WebsiteAudienceAcquisition:
11
+ def __init__(self, model_url):
12
+ self.uploaded_files = []
13
+ self.file_dict = {}
14
+ self.model_url = model_url
15
+ #self.analyst_name = analyst_name
16
+ #self.data_src = data_src
17
+ #self.analyst_description = analyst_description
18
+ self.initialize()
19
+ self.row1()
20
+
21
+ def initialize(self):
22
+ # FOR ENV
23
+ load_dotenv()
24
+
25
+ # AGENT NAME
26
+ #st.header(self.analyst_name)
27
+
28
+ def request_model(self, payload_txt, headers):
29
+ response = requests.post(self.model_url, json=payload_txt, headers=headers)
30
+ response.raise_for_status()
31
+ output = response.json()
32
+ #st.write(output)
33
+ text = output["outputs"][0]["outputs"][0]["results"]["text"]["data"]["text"]
34
+ #text = json.loads(text)
35
+ #st.write(text)
36
+ return text
37
+
38
+ def fetch_data(self, data_field):
39
+ mongodb_uri = os.getenv("MONGODB_URI")
40
+ myclient = MongoClient(mongodb_uri)
41
+ mydb = myclient.get_database()
42
+ mycol = mydb["df_data"]
43
+
44
+ # Sort by timestamp field in descending order
45
+ x = mycol.find_one(
46
+ {"data_field": data_field},
47
+ sort=[("timestamp", -1)]
48
+ )
49
+
50
+ x = x["result"]
51
+ return x
52
+
53
+ def process(self):
54
+ with st.spinner('Website Audience Acquisition...', show_time=True):
55
+ st.write('')
56
+ headers = {"Content-Type": "application/json", "x-api-key": f"{os.getenv('x-api-key')}"}
57
+ try:
58
+ payload_txt = {"input_value": self.payload, "output_type": "text", "input_type": "chat"}
59
+ payload_txt_model = self.request_model(payload_txt, headers)
60
+ debug_info = {'data_field' : 'Website Audience Acquisition', 'result': payload_txt_model}
61
+ upload_response(debug_info)
62
+ st.session_state['website_audience'] = ''
63
+
64
+ count = 0
65
+ except Exception as e:
66
+ pass
67
+ st.session_state['analyzing'] = False
68
+
69
+ def row1(self):
70
+ st.session_state['analyzing'] = False
71
+ self.payload = ""
72
+ count = 0
73
+ try:
74
+ session_content_outside_the_website = st.session_state['website_audience']
75
+ if session_content_outside_the_website == 'uploaded':
76
+ count += 1
77
+ self.payload += self.fetch_data("Website Audience Acquisition")
78
+ except Exception as e:
79
+ pass
80
+
81
+ if count >= 1:
82
+ name = self.fetch_data("Client Name")
83
+ website = self.fetch_data("Client Website")
84
+ self.payload = name + website + self.payload
85
+ self.process()
86
+
87
+
88
+ if __name__ == "__main__":
89
+ st.set_page_config(layout="wide")
90
+
91
+ upload = uploadFile()
helper/upload_File.py CHANGED
@@ -28,6 +28,48 @@ class uploadFile:
28
  pass
29
 
30
  st.session_state['uploaded_files'] = self.file_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def upload_file_seo(self, uploaded_files):
33
  for _ in range(len(self.file_dict)):
 
28
  pass
29
 
30
  st.session_state['uploaded_files'] = self.file_dict
31
+
32
+ def upload_website_audience(self, uploaded_files):
33
+ for _ in range(len(self.file_dict)):
34
+ self.file_dict.popitem()
35
+
36
+ for uploaded_file in uploaded_files:
37
+ if uploaded_file.type == "application/pdf":
38
+ try:
39
+ with pymupdf.open(stream=uploaded_file.read(), filetype="pdf") as doc:
40
+ text = chr(12).join([page.get_text() for page in doc])
41
+ self.file_dict[uploaded_file.name] = {'type': 'pdf', 'content': text}
42
+ except Exception:
43
+ pass
44
+ elif uploaded_file.type == "text/csv":
45
+ try:
46
+ # Skip comment lines that start with #
47
+ df = pd.read_csv(
48
+ uploaded_file,
49
+ comment='#', # Treat lines starting with # as comments
50
+ engine='python' # Use more flexible engine
51
+ )
52
+ self.file_dict[uploaded_file.name] = {'type': 'csv', 'content': df}
53
+ except Exception as e:
54
+ print(f"Error processing CSV: {str(e)}")
55
+ # If that fails, you could try a more manual approach
56
+ try:
57
+ uploaded_file.seek(0)
58
+ raw_text = uploaded_file.read().decode('utf-8')
59
+ # Get only non-comment lines
60
+ data_lines = [line for line in raw_text.split('\n') if not line.strip().startswith('#')]
61
+
62
+ # Use StringIO to create a file-like object from the filtered lines
63
+ from io import StringIO
64
+ csv_data = StringIO('\n'.join(data_lines))
65
+
66
+ # Read from the filtered data
67
+ df = pd.read_csv(csv_data)
68
+ self.file_dict[uploaded_file.name] = {'type': 'csv', 'content': df}
69
+ except Exception as e:
70
+ print(f"Second attempt failed: {str(e)}")
71
+
72
+ st.session_state['upload_website_audience'] = self.file_dict
73
 
74
  def upload_file_seo(self, uploaded_files):
75
  for _ in range(len(self.file_dict)):
pages/analyzing_page.py CHANGED
@@ -17,6 +17,7 @@ from classes.response_target_market import TargetMarket
17
  from classes.response_df_overview import dfOverview
18
  from classes.response_desired_outcome import DesiredOutcome
19
  from classes.response_conversion_analyst import ConversionAnalyst
 
20
  from classes.response_executive_summary import ExecutiveSummary
21
  from classes.response_snapshot import Snapshot
22
 
@@ -72,10 +73,10 @@ def run_analysis():
72
  "df_overview": st.empty(),
73
  "desired_outcome": st.empty(),
74
  "conversion": st.empty(),
 
75
  "snapshot": st.empty(),
76
  "executive_summary": st.empty(),
77
 
78
-
79
  }
80
 
81
  # Create thread-safe handlers for each analysis type
@@ -237,6 +238,17 @@ def run_analysis():
237
  handler.update_error(f"Conversion Analysis failed: {str(e)}")
238
  return None
239
 
 
 
 
 
 
 
 
 
 
 
 
240
  def run_snapshot_analysis():
241
  handler = handlers["snapshot"]
242
  try:
@@ -274,7 +286,8 @@ def run_analysis():
274
  (run_target_market_analysis, "target_market"),
275
  (run_df_overview_analysis, "df_overview"),
276
  (run_desired_outcomes_analysis, "desired_outcome"),
277
- (run_conversion_analysis, "conversion")
 
278
  ]
279
 
280
  # Create and start first batch threads with small delays to prevent UI conflicts
 
17
  from classes.response_df_overview import dfOverview
18
  from classes.response_desired_outcome import DesiredOutcome
19
  from classes.response_conversion_analyst import ConversionAnalyst
20
+ from classes.response_website_audience_acquisition import WebsiteAudienceAcquisition
21
  from classes.response_executive_summary import ExecutiveSummary
22
  from classes.response_snapshot import Snapshot
23
 
 
73
  "df_overview": st.empty(),
74
  "desired_outcome": st.empty(),
75
  "conversion": st.empty(),
76
+ "website_audience": st.empty(),
77
  "snapshot": st.empty(),
78
  "executive_summary": st.empty(),
79
 
 
80
  }
81
 
82
  # Create thread-safe handlers for each analysis type
 
238
  handler.update_error(f"Conversion Analysis failed: {str(e)}")
239
  return None
240
 
241
+ def run_website_audience():
242
+ handler = handlers["website_audience"]
243
+ try:
244
+ handler.update_info("Running Website Audience Acquisition Analysis...")
245
+ result = WebsiteAudienceAcquisition(os.getenv('Model_Website_Audience_Acquisition_Analyst'))
246
+ handler.update_success("Website Audience Acquisition Analysis completed successfully.")
247
+ return result
248
+ except Exception as e:
249
+ handler.update_error(f"Website Audience Acquisition Analysis failed: {str(e)}")
250
+ return None
251
+
252
  def run_snapshot_analysis():
253
  handler = handlers["snapshot"]
254
  try:
 
286
  (run_target_market_analysis, "target_market"),
287
  (run_df_overview_analysis, "df_overview"),
288
  (run_desired_outcomes_analysis, "desired_outcome"),
289
+ (run_conversion_analysis, "conversion"),
290
+ (run_website_audience, "website_audience")
291
  ]
292
 
293
  # Create and start first batch threads with small delays to prevent UI conflicts
pages/home.py CHANGED
@@ -65,8 +65,7 @@ class DigitalFootprintDashboard:
65
 
66
  self.client_summary = CientSummary()
67
 
68
-
69
-
70
  with col2:
71
  st.write("## Website Traffic")
72
  self.backlinks = SeoOffPageAnalyst(os.getenv('MODEL_Off_Page_Analyst'))
 
65
 
66
  self.client_summary = CientSummary()
67
 
68
+
 
69
  with col2:
70
  st.write("## Website Traffic")
71
  self.backlinks = SeoOffPageAnalyst(os.getenv('MODEL_Off_Page_Analyst'))