itsOwen committed on
Commit
65cb2b8
·
1 Parent(s): 583638c

several fixes, removed logging

Browse files
app/ui_components.py CHANGED
@@ -41,7 +41,7 @@ def display_info_icons():
41
  if time.time() - st.session_state.info_icons_time > 10 or ("messages" in st.session_state and len(st.session_state.messages) > 0):
42
  st.session_state.info_icons_displayed = False
43
 
44
- def extract_data_from_markdown(text: Union[str, bytes, io.BytesIO]) -> Union[str, bytes, None]:
45
  if isinstance(text, io.BytesIO):
46
  return text
47
  if isinstance(text, bytes):
@@ -49,16 +49,24 @@ def extract_data_from_markdown(text: Union[str, bytes, io.BytesIO]) -> Union[str
49
  pattern = r'```(csv|excel)\n(.*?)\n```'
50
  match = re.search(pattern, text, re.DOTALL)
51
  if match:
52
- return match.group(2).strip()
 
 
 
 
53
  return None
54
 
55
  def format_data(data: Union[str, bytes, io.BytesIO], format_type: str):
56
  try:
57
  if isinstance(data, io.BytesIO):
 
 
58
  data.seek(0)
59
- return pd.read_excel(data, engine='openpyxl')
60
  elif isinstance(data, bytes):
61
- return pd.read_excel(io.BytesIO(data), engine='openpyxl')
 
 
62
  else:
63
  if format_type == 'csv':
64
  csv_data = []
 
41
  if time.time() - st.session_state.info_icons_time > 10 or ("messages" in st.session_state and len(st.session_state.messages) > 0):
42
  st.session_state.info_icons_displayed = False
43
 
44
+ def extract_data_from_markdown(text: Union[str, bytes, io.BytesIO]) -> Union[str, bytes, io.BytesIO, None]:
45
  if isinstance(text, io.BytesIO):
46
  return text
47
  if isinstance(text, bytes):
 
49
  pattern = r'```(csv|excel)\n(.*?)\n```'
50
  match = re.search(pattern, text, re.DOTALL)
51
  if match:
52
+ data_type = match.group(1)
53
+ data = match.group(2).strip()
54
+ if data_type == 'excel':
55
+ return io.BytesIO(data.encode())
56
+ return data
57
  return None
58
 
59
  def format_data(data: Union[str, bytes, io.BytesIO], format_type: str):
60
  try:
61
  if isinstance(data, io.BytesIO):
62
+ if format_type == 'excel':
63
+ return pd.read_excel(data, engine='openpyxl')
64
  data.seek(0)
65
+ return pd.read_csv(data)
66
  elif isinstance(data, bytes):
67
+ if format_type == 'excel':
68
+ return pd.read_excel(io.BytesIO(data), engine='openpyxl')
69
+ return pd.read_csv(io.BytesIO(data))
70
  else:
71
  if format_type == 'csv':
72
  csv_data = []
main.py CHANGED
@@ -1,7 +1,6 @@
1
  import streamlit as st
2
  import json
3
  import asyncio
4
- import logging
5
  from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
6
  from app.ui_components import display_info_icons, display_message, extract_data_from_markdown, format_data
7
  from app.utils import loading_animation, get_loading_message
@@ -11,6 +10,8 @@ import pandas as pd
11
  import base64
12
  from google_auth_oauthlib.flow import Flow
13
  import io
 
 
14
  from src.utils.google_sheets_utils import SCOPES, get_redirect_uri, display_google_sheets_button, initiate_google_auth
15
 
16
  def handle_oauth_callback():
@@ -28,18 +29,88 @@ def handle_oauth_callback():
28
  except Exception as e:
29
  st.error(f"Error during OAuth callback: {str(e)}")
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def safe_process_message(web_scraper_chat, message):
32
  if message is None or message.strip() == "":
33
  return "I'm sorry, but I didn't receive any input. Could you please try again?"
34
  try:
35
  response = web_scraper_chat.process_message(message)
36
- if isinstance(response, tuple) and len(response) == 2 and isinstance(response[1], pd.DataFrame):
37
- csv_string, df = response
38
- st.text("CSV Data:")
39
- st.code(csv_string, language="csv")
40
- st.text("Interactive Table:")
41
- st.dataframe(df)
42
- return csv_string
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  return response
44
  except AttributeError as e:
45
  if "'NoneType' object has no attribute 'lower'" in str(e):
@@ -47,19 +118,9 @@ def safe_process_message(web_scraper_chat, message):
47
  else:
48
  raise e
49
  except Exception as e:
 
50
  return f"An unexpected error occurred: {str(e)}. Please try again or contact support if the issue persists."
51
 
52
- def load_chat_history():
53
- try:
54
- with open("chat_history.json", "r") as f:
55
- return json.load(f)
56
- except FileNotFoundError:
57
- return {}
58
-
59
- def save_chat_history(chat_history):
60
- with open("chat_history.json", "w") as f:
61
- json.dump(chat_history, f)
62
-
63
  def get_date_group(date_str):
64
  date = datetime.strptime(date_str, "%Y-%m-%d")
65
  today = datetime.now().date()
@@ -95,13 +156,6 @@ async def list_ollama_models():
95
  st.error(f"Error fetching Ollama models: {str(e)}")
96
  return []
97
 
98
- def setup_logging(enable_logging):
99
- if enable_logging:
100
- logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
101
- return logging.getLogger(__name__)
102
- else:
103
- return logging.getLogger(__name__)
104
-
105
  def load_css():
106
  with open("app/styles.css", "r") as f:
107
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
@@ -124,19 +178,49 @@ def render_message(role, content, avatar_path):
124
 
125
  def display_message_with_sheets_upload(message, message_index):
126
  content = message["content"]
127
- if isinstance(content, (str, bytes, io.BytesIO)):
128
  data = extract_data_from_markdown(content)
129
  if data is not None:
130
- if isinstance(data, io.BytesIO) or (isinstance(content, str) and 'excel' in content.lower()):
131
- df = format_data(data, 'excel')
132
- else:
133
- df = format_data(data, 'csv')
134
-
135
- if df is not None:
136
- st.dataframe(df)
137
- display_google_sheets_button(df)
138
- else:
139
- st.warning("Failed to display data as a table. Showing raw content:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  st.code(content)
141
  else:
142
  st.markdown(content)
@@ -159,12 +243,6 @@ def main():
159
  user_avatar_path = "app/icons/man.png"
160
  ai_avatar_path = "app/icons/skull.png"
161
 
162
- if 'enable_logging' not in st.session_state:
163
- st.session_state.enable_logging = False
164
-
165
- logger = setup_logging(st.session_state.enable_logging)
166
- logger.debug("Starting CyberScraper 2077")
167
-
168
  if 'chat_history' not in st.session_state:
169
  st.session_state.chat_history = load_chat_history()
170
  if 'current_chat_id' not in st.session_state or st.session_state.current_chat_id not in st.session_state.chat_history:
@@ -186,12 +264,6 @@ def main():
186
  with st.sidebar:
187
  st.title("Conversation History")
188
 
189
- st.session_state.enable_logging = st.toggle("Enable Logging", st.session_state.enable_logging)
190
- if st.session_state.enable_logging:
191
- st.info("Logging is enabled. Check your console for log messages.")
192
- else:
193
- st.info("Logging is disabled.")
194
-
195
  # Model selection
196
  st.subheader("Select Model")
197
  default_models = ["gpt-4o-mini", "gpt-3.5-turbo"]
@@ -296,14 +368,9 @@ def main():
296
  prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")
297
 
298
  if prompt:
299
- if st.session_state.enable_logging:
300
- logger.debug(f"Received prompt: {prompt}")
301
  st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
302
- save_chat_history(st.session_state.chat_history)
303
 
304
  if not st.session_state.web_scraper_chat:
305
- if st.session_state.enable_logging:
306
- logger.debug("Initializing web_scraper_chat")
307
  st.session_state.web_scraper_chat = initialize_web_scraper_chat()
308
 
309
  with st.chat_message("assistant"):
@@ -313,12 +380,14 @@ def main():
313
  st.session_state.web_scraper_chat,
314
  prompt
315
  )
 
316
  if full_response is not None:
317
- st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
 
 
 
318
  save_chat_history(st.session_state.chat_history)
319
  except Exception as e:
320
- if st.session_state.enable_logging:
321
- logger.error(f"An unexpected error occurred: {str(e)}")
322
  st.error(f"An unexpected error occurred: {str(e)}")
323
 
324
  st.rerun()
 
1
  import streamlit as st
2
  import json
3
  import asyncio
 
4
  from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
5
  from app.ui_components import display_info_icons, display_message, extract_data_from_markdown, format_data
6
  from app.utils import loading_animation, get_loading_message
 
10
  import base64
11
  from google_auth_oauthlib.flow import Flow
12
  import io
13
+ from io import BytesIO
14
+ import re
15
  from src.utils.google_sheets_utils import SCOPES, get_redirect_uri, display_google_sheets_button, initiate_google_auth
16
 
17
  def handle_oauth_callback():
 
29
  except Exception as e:
30
  st.error(f"Error during OAuth callback: {str(e)}")
31
 
32
def serialize_bytesio(obj):
    """JSON ``default`` hook that encodes a ``BytesIO`` as a tagged dict.

    Args:
        obj: The object ``json`` could not serialise on its own.

    Returns:
        ``{"_type": "BytesIO", "data": <base64 text>}`` holding the buffer's
        full contents, regardless of its current read position.

    Raises:
        TypeError: If *obj* is not a ``BytesIO``, with the same message
            shape ``json`` uses for unsupported types.
    """
    if isinstance(obj, BytesIO):
        return {
            "_type": "BytesIO",
            "data": base64.b64encode(obj.getvalue()).decode('utf-8')
        }
    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")


def deserialize_bytesio(obj):
    """JSON ``object_hook`` that turns a tagged dict back into a ``BytesIO``.

    Args:
        obj: A dict decoded by ``json`` (the hook is called for every object).

    Returns:
        A ``BytesIO`` when *obj* is a ``{"_type": "BytesIO", "data": str}``
        marker; otherwise *obj* unchanged.
    """
    if isinstance(obj, dict) and obj.get("_type") == "BytesIO":
        payload = obj.get("data")
        # Only a marker that actually carries base64 text is decoded; a dict
        # that merely has "_type" is passed through instead of raising KeyError.
        if isinstance(payload, str):
            return BytesIO(base64.b64decode(payload))
    return obj


def save_chat_history(chat_history):
    """Write *chat_history* to ``chat_history.json`` in the working directory.

    Args:
        chat_history: JSON-compatible structure; ``BytesIO`` values are
            encoded through ``serialize_bytesio``.

    Raises:
        TypeError: If the history contains a value that cannot be serialised.
            The existing file is left untouched in that case.
    """
    # Serialise completely before opening the file: opening with "w" truncates
    # it, so a failure halfway through a streaming json.dump would leave a
    # corrupt history behind.
    serialized = json.dumps(chat_history, default=serialize_bytesio)
    with open("chat_history.json", "w") as f:
        f.write(serialized)


def load_chat_history():
    """Read the chat history from ``chat_history.json``.

    Returns:
        The stored history dict, or ``{}`` when the file is missing,
        unreadable as JSON, or does not contain a JSON object.
    """
    try:
        with open("chat_history.json", "r") as f:
            history = json.load(f, object_hook=deserialize_bytesio)
    except FileNotFoundError:
        return {}
    except ValueError as e:
        # Covers invalid JSON, undecodable bytes and bad base64 payloads
        # (all ValueError subclasses): start fresh rather than crash on startup.
        print(f"Error loading chat history: {str(e)}")
        return {}
    return history if isinstance(history, dict) else {}
55
+
56
  def safe_process_message(web_scraper_chat, message):
57
  if message is None or message.strip() == "":
58
  return "I'm sorry, but I didn't receive any input. Could you please try again?"
59
  try:
60
  response = web_scraper_chat.process_message(message)
61
+ st.write("Debug: Response type:", type(response))
62
+
63
+ if isinstance(response, tuple):
64
+ st.write("Debug: Response is a tuple")
65
+ if len(response) == 2 and isinstance(response[1], pd.DataFrame):
66
+ st.write("Debug: CSV data detected")
67
+ csv_string, df = response
68
+ st.text("CSV Data:")
69
+ st.code(csv_string, language="csv")
70
+ st.text("Interactive Table:")
71
+ st.dataframe(df)
72
+
73
+ csv_buffer = BytesIO()
74
+ df.to_csv(csv_buffer, index=False)
75
+ csv_buffer.seek(0)
76
+ st.download_button(
77
+ label="Download CSV",
78
+ data=csv_buffer,
79
+ file_name="data.csv",
80
+ mime="text/csv"
81
+ )
82
+
83
+ return csv_string
84
+ elif len(response) == 2 and isinstance(response[0], BytesIO):
85
+ st.write("Debug: Excel data detected")
86
+ excel_buffer, df = response
87
+ st.text("Excel Data:")
88
+ st.dataframe(df)
89
+
90
+ excel_buffer.seek(0)
91
+ st.download_button(
92
+ label="Download Original Excel file",
93
+ data=excel_buffer,
94
+ file_name="data_original.xlsx",
95
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
96
+ )
97
+
98
+ excel_data = BytesIO()
99
+ with pd.ExcelWriter(excel_data, engine='xlsxwriter') as writer:
100
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
101
+ excel_data.seek(0)
102
+
103
+ st.download_button(
104
+ label="Download Excel (from DataFrame)",
105
+ data=excel_data,
106
+ file_name="data_from_df.xlsx",
107
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
108
+ )
109
+
110
+ return ("Excel data displayed and available for download.", excel_buffer)
111
+ else:
112
+ st.write("Debug: Response is not a tuple")
113
+
114
  return response
115
  except AttributeError as e:
116
  if "'NoneType' object has no attribute 'lower'" in str(e):
 
118
  else:
119
  raise e
120
  except Exception as e:
121
+ st.write("Debug: Exception occurred:", str(e))
122
  return f"An unexpected error occurred: {str(e)}. Please try again or contact support if the issue persists."
123
 
 
 
 
 
 
 
 
 
 
 
 
124
  def get_date_group(date_str):
125
  date = datetime.strptime(date_str, "%Y-%m-%d")
126
  today = datetime.now().date()
 
156
  st.error(f"Error fetching Ollama models: {str(e)}")
157
  return []
158
 
 
 
 
 
 
 
 
159
  def load_css():
160
  with open("app/styles.css", "r") as f:
161
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
 
178
 
179
  def display_message_with_sheets_upload(message, message_index):
180
  content = message["content"]
181
+ if isinstance(content, (str, bytes, BytesIO)):
182
  data = extract_data_from_markdown(content)
183
  if data is not None:
184
+ try:
185
+ is_excel = isinstance(data, BytesIO) or (isinstance(content, str) and 'excel' in content.lower())
186
+ if is_excel:
187
+ df = format_data(data, 'excel')
188
+ else:
189
+ df = format_data(data, 'csv')
190
+
191
+ if df is not None:
192
+ st.dataframe(df)
193
+
194
+ if not is_excel:
195
+ csv_buffer = BytesIO()
196
+ df.to_csv(csv_buffer, index=False)
197
+ csv_buffer.seek(0)
198
+ st.download_button(
199
+ label="📥 Download as CSV",
200
+ data=csv_buffer,
201
+ file_name="data.csv",
202
+ mime="text/csv",
203
+ key=f"csv_download_{message_index}"
204
+ )
205
+ else:
206
+ excel_buffer = BytesIO()
207
+ with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
208
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
209
+ excel_buffer.seek(0)
210
+ st.download_button(
211
+ label="📥 Download as Excel",
212
+ data=excel_buffer,
213
+ file_name="data.xlsx",
214
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
215
+ key=f"excel_download_{message_index}"
216
+ )
217
+
218
+ display_google_sheets_button(df, f"sheets_upload_{message_index}")
219
+ else:
220
+ st.warning("Failed to display data as a table. Showing raw content:")
221
+ st.code(content)
222
+ except Exception as e:
223
+ st.error(f"Error processing data: {str(e)}")
224
  st.code(content)
225
  else:
226
  st.markdown(content)
 
243
  user_avatar_path = "app/icons/man.png"
244
  ai_avatar_path = "app/icons/skull.png"
245
 
 
 
 
 
 
 
246
  if 'chat_history' not in st.session_state:
247
  st.session_state.chat_history = load_chat_history()
248
  if 'current_chat_id' not in st.session_state or st.session_state.current_chat_id not in st.session_state.chat_history:
 
264
  with st.sidebar:
265
  st.title("Conversation History")
266
 
 
 
 
 
 
 
267
  # Model selection
268
  st.subheader("Select Model")
269
  default_models = ["gpt-4o-mini", "gpt-3.5-turbo"]
 
368
  prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")
369
 
370
  if prompt:
 
 
371
  st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
 
372
 
373
  if not st.session_state.web_scraper_chat:
 
 
374
  st.session_state.web_scraper_chat = initialize_web_scraper_chat()
375
 
376
  with st.chat_message("assistant"):
 
380
  st.session_state.web_scraper_chat,
381
  prompt
382
  )
383
+ st.write("Debug: Full response type:", type(full_response))
384
  if full_response is not None:
385
+ if isinstance(full_response, tuple) and len(full_response) == 2 and isinstance(full_response[1], BytesIO):
386
+ st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response[0]})
387
+ else:
388
+ st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
389
  save_chat_history(st.session_state.chat_history)
390
  except Exception as e:
 
 
391
  st.error(f"An unexpected error occurred: {str(e)}")
392
 
393
  st.rerun()
src/ollama_models.py CHANGED
@@ -1,19 +1,14 @@
1
  import requests
2
  from typing import List, Dict, Any
3
- import logging
4
  import os
5
  import json
6
 
7
  class OllamaModel:
8
  def __init__(self, model_name: str):
9
  self.model_name = model_name
10
- self.logger = logging.getLogger(__name__)
11
- self.logger.setLevel(logging.DEBUG)
12
  self.base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
13
 
14
  async def generate(self, prompt: str, system_prompt: str = "") -> str:
15
- self.logger.debug(f"Generating with Ollama model: {self.model_name}")
16
- self.logger.debug(f"Prompt (first 500 chars): {prompt[:500]}...")
17
  try:
18
  response = requests.post(
19
  f"{self.base_url}/api/generate",
@@ -35,26 +30,22 @@ class OllamaModel:
35
  if 'response' in data:
36
  full_response += data['response']
37
  except json.JSONDecodeError:
38
- self.logger.warning(f"Failed to parse JSON: {line}")
39
 
40
- self.logger.debug(f"Ollama response (first 500 chars): {full_response[:500]}...")
41
  return full_response
42
  except Exception as e:
43
- self.logger.error(f"Error generating with Ollama: {str(e)}")
44
  raise
45
 
46
  @staticmethod
47
  async def list_models() -> List[str]:
48
- logger = logging.getLogger(__name__)
49
  base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
50
  try:
51
  response = requests.get(f"{base_url}/api/tags")
52
  response.raise_for_status()
53
  models = response.json()
54
- logger.debug(f"Available Ollama models: {models['models']}")
55
  return [model['name'] for model in models['models']]
56
  except Exception as e:
57
- logger.error(f"Error listing Ollama models: {str(e)}")
58
  return []
59
 
60
  class OllamaModelManager:
 
1
  import requests
2
  from typing import List, Dict, Any
 
3
  import os
4
  import json
5
 
6
  class OllamaModel:
7
  def __init__(self, model_name: str):
8
  self.model_name = model_name
 
 
9
  self.base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
10
 
11
  async def generate(self, prompt: str, system_prompt: str = "") -> str:
 
 
12
  try:
13
  response = requests.post(
14
  f"{self.base_url}/api/generate",
 
30
  if 'response' in data:
31
  full_response += data['response']
32
  except json.JSONDecodeError:
33
+ print(f"Error decoding JSON: {line}")
34
 
 
35
  return full_response
36
  except Exception as e:
37
+ print(f"An error occurred: {str(e)}")
38
  raise
39
 
40
  @staticmethod
41
  async def list_models() -> List[str]:
 
42
  base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
43
  try:
44
  response = requests.get(f"{base_url}/api/tags")
45
  response.raise_for_status()
46
  models = response.json()
 
47
  return [model['name'] for model in models['models']]
48
  except Exception as e:
 
49
  return []
50
 
51
  class OllamaModelManager:
src/utils/google_sheets_utils.py CHANGED
@@ -8,11 +8,9 @@ import pandas as pd
8
  from datetime import datetime
9
  import os
10
  import json
11
- import logging
12
  import hashlib
13
-
14
- logging.basicConfig(level=logging.DEBUG)
15
- logger = logging.getLogger(__name__)
16
 
17
  SCOPES = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive.file']
18
  TOKEN_FILE = 'token.json'
@@ -38,18 +36,16 @@ def get_google_sheets_credentials():
38
  if os.path.exists(TOKEN_FILE):
39
  try:
40
  creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
41
- logger.debug("Loaded credentials from token file")
42
  except Exception as e:
43
- logger.error(f"Error loading credentials from token file: {str(e)}")
44
 
45
  if not creds or not creds.valid:
46
  if creds and creds.expired and creds.refresh_token:
47
  try:
48
  creds.refresh(Request())
49
- logger.debug("Refreshed expired credentials")
50
  save_credentials(creds)
51
  except Exception as e:
52
- logger.error(f"Error refreshing credentials: {str(e)}")
53
  creds = None
54
  else:
55
  creds = None
@@ -58,31 +54,41 @@ def get_google_sheets_credentials():
58
  if 'google_auth_token' in st.session_state:
59
  try:
60
  creds = Credentials.from_authorized_user_info(json.loads(st.session_state['google_auth_token']), SCOPES)
61
- logger.debug("Loaded credentials from session state")
62
  save_credentials(creds)
63
  except Exception as e:
64
- logger.error(f"Error loading credentials from session state: {str(e)}")
65
-
66
  return creds
67
 
68
  def save_credentials(creds):
69
  try:
70
  with open(TOKEN_FILE, 'w') as token:
71
  token.write(creds.to_json())
72
- logger.debug("Saved credentials to token file")
73
  except Exception as e:
74
- logger.error(f"Error saving credentials to token file: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def upload_to_google_sheets(df):
 
 
77
  creds = get_google_sheets_credentials()
78
  if not creds:
79
- logger.error("Failed to obtain valid credentials.")
80
  return None
81
 
82
  try:
83
- service = build('sheets', 'v4', credentials=creds)
84
- logger.debug("Built Sheets service")
85
-
86
  spreadsheet = {
87
  'properties': {
88
  'title': f"CyberScraper Data {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
@@ -90,38 +96,38 @@ def upload_to_google_sheets(df):
90
  }
91
  spreadsheet = service.spreadsheets().create(body=spreadsheet, fields='spreadsheetId').execute()
92
  spreadsheet_id = spreadsheet.get('spreadsheetId')
93
- logger.debug(f"Created new spreadsheet with ID: {spreadsheet_id}")
 
 
 
 
94
 
95
  values = [df.columns.tolist()] + df.values.tolist()
96
  body = {'values': values}
97
  result = service.spreadsheets().values().update(
98
  spreadsheetId=spreadsheet_id, range='Sheet1',
99
  valueInputOption='RAW', body=body).execute()
100
- logger.debug(f"Updated spreadsheet. Cells updated: {result.get('updatedCells')}")
101
-
102
  return spreadsheet_id
103
  except HttpError as error:
104
- logger.error(f"An HTTP error occurred: {error}")
105
  return None
106
  except Exception as e:
107
- logger.error(f"An unexpected error occurred: {str(e)}")
108
  return None
109
 
110
- def display_google_sheets_button(df):
111
- df_hash = hash(str(df))
112
-
113
  creds = get_google_sheets_credentials()
114
  if not creds:
115
  auth_button = '🔑 Authorize Google Sheets'
116
- if st.button(auth_button, key=f"auth_sheets_{df_hash}", help="Authorize access to Google Sheets"):
117
  initiate_google_auth()
118
  else:
119
  upload_button = '✅ Upload to Google Sheets'
120
- if st.button(upload_button, key=f"upload_{df_hash}", help="Upload data to Google Sheets"):
121
  with st.spinner("Uploading to Google Sheets..."):
122
- spreadsheet_id = upload_to_google_sheets(df)
123
  if spreadsheet_id:
124
  st.success(f"Data uploaded successfully. Spreadsheet ID: {spreadsheet_id}")
125
  st.markdown(f"[Open Spreadsheet](https://docs.google.com/spreadsheets/d/{spreadsheet_id})")
126
  else:
127
- st.error("Failed to upload data to Google Sheets.")
 
8
  from datetime import datetime
9
  import os
10
  import json
 
11
  import hashlib
12
+ import re
13
+ from io import BytesIO
 
14
 
15
  SCOPES = ['https://www.googleapis.com/auth/spreadsheets', 'https://www.googleapis.com/auth/drive.file']
16
  TOKEN_FILE = 'token.json'
 
36
  if os.path.exists(TOKEN_FILE):
37
  try:
38
  creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)
 
39
  except Exception as e:
40
+ print(f"Error loading credentials from file: {str(e)}")
41
 
42
  if not creds or not creds.valid:
43
  if creds and creds.expired and creds.refresh_token:
44
  try:
45
  creds.refresh(Request())
 
46
  save_credentials(creds)
47
  except Exception as e:
48
+ print(f"Error refreshing credentials: {str(e)}")
49
  creds = None
50
  else:
51
  creds = None
 
54
  if 'google_auth_token' in st.session_state:
55
  try:
56
  creds = Credentials.from_authorized_user_info(json.loads(st.session_state['google_auth_token']), SCOPES)
 
57
  save_credentials(creds)
58
  except Exception as e:
59
+ print(f"Error creating credentials from session state: {str(e)}")
 
60
  return creds
61
 
62
  def save_credentials(creds):
63
  try:
64
  with open(TOKEN_FILE, 'w') as token:
65
  token.write(creds.to_json())
 
66
  except Exception as e:
67
+ print(f"Error saving credentials: {str(e)}")
68
+
69
def clean_data_for_sheets(df):
    """Return a copy of *df* in which every cell is a single-line string.

    Missing scalar values become ``""``; everything else is converted with
    ``str()``, newlines are replaced by a space and carriage returns removed.

    Args:
        df: DataFrame to prepare for upload. It is not modified.

    Returns:
        A new DataFrame with the same columns and index, holding only strings.
    """
    def clean_value(val):
        # pd.isna() on a list/array cell returns an array whose truth value is
        # ambiguous, so only scalars are tested for missingness.
        if pd.api.types.is_scalar(val) and pd.isna(val):
            return ""
        return str(val).replace('\n', ' ').replace('\r', '')

    # Work on a copy so the caller's DataFrame keeps its original values.
    cleaned = df.copy()
    for col in cleaned.columns:
        cleaned[col] = cleaned[col].map(clean_value)

    return cleaned
84
+
85
+ def upload_to_google_sheets(data):
86
  creds = get_google_sheets_credentials()
87
  if not creds:
 
88
  return None
89
 
90
  try:
91
+ service = build('sheets', 'v4', credentials=creds)
 
 
92
  spreadsheet = {
93
  'properties': {
94
  'title': f"CyberScraper Data {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
 
96
  }
97
  spreadsheet = service.spreadsheets().create(body=spreadsheet, fields='spreadsheetId').execute()
98
  spreadsheet_id = spreadsheet.get('spreadsheetId')
99
+
100
+ if isinstance(data, pd.DataFrame):
101
+ df = clean_data_for_sheets(data)
102
+ else:
103
+ return None
104
 
105
  values = [df.columns.tolist()] + df.values.tolist()
106
  body = {'values': values}
107
  result = service.spreadsheets().values().update(
108
  spreadsheetId=spreadsheet_id, range='Sheet1',
109
  valueInputOption='RAW', body=body).execute()
 
 
110
  return spreadsheet_id
111
  except HttpError as error:
112
+ print(f"An HTTP error occurred: {error}")
113
  return None
114
  except Exception as e:
115
+ print(f"An error occurred: {str(e)}")
116
  return None
117
 
118
+ def display_google_sheets_button(data, unique_key):
 
 
119
  creds = get_google_sheets_credentials()
120
  if not creds:
121
  auth_button = '🔑 Authorize Google Sheets'
122
+ if st.button(auth_button, key=f"auth_sheets_{unique_key}", help="Authorize access to Google Sheets"):
123
  initiate_google_auth()
124
  else:
125
  upload_button = '✅ Upload to Google Sheets'
126
+ if st.button(upload_button, key=f"upload_{unique_key}", help="Upload data to Google Sheets"):
127
  with st.spinner("Uploading to Google Sheets..."):
128
+ spreadsheet_id = upload_to_google_sheets(data)
129
  if spreadsheet_id:
130
  st.success(f"Data uploaded successfully. Spreadsheet ID: {spreadsheet_id}")
131
  st.markdown(f"[Open Spreadsheet](https://docs.google.com/spreadsheets/d/{spreadsheet_id})")
132
  else:
133
+ st.error("Failed to upload data to Google Sheets. Check the console for error details.")
src/web_extractor.py CHANGED
@@ -2,7 +2,8 @@ import asyncio
2
  from typing import Dict, Any, Optional, List, Tuple
3
  import json
4
  import pandas as pd
5
- from io import StringIO
 
6
  import re
7
  from functools import lru_cache
8
  import hashlib
@@ -17,7 +18,6 @@ from langchain.prompts import PromptTemplate
17
  from langchain.schema.runnable import RunnableSequence
18
  from langchain.text_splitter import RecursiveCharacterTextSplitter
19
  import tiktoken
20
- import logging
21
  import csv
22
  from bs4 import BeautifulSoup, Comment
23
 
@@ -46,8 +46,6 @@ class WebExtractor:
46
  length_function=self.num_tokens_from_string,
47
  )
48
  self.max_tokens = 128000 if model_name == "gpt-4o-mini" else 16385
49
- self.logger = logging.getLogger(__name__)
50
- self.logger.setLevel(logging.DEBUG)
51
  self.query_cache = {}
52
  self.content_hash = None
53
 
@@ -156,9 +154,7 @@ class WebExtractor:
156
 
157
  return text
158
 
159
- async def _extract_info(self, query: str) -> str:
160
- self.logger.debug(f"Extracting info with model: {self.model}")
161
-
162
  if not self.preprocessed_content:
163
  return "Please provide a URL first before asking for information."
164
 
@@ -179,15 +175,12 @@ class WebExtractor:
179
  extracted_data = await self._cached_api_call(content_hash, query)
180
  else:
181
  chunks = self.optimized_text_splitter(self.preprocessed_content)
182
- self.logger.debug(f"Content split into {len(chunks)} chunks")
183
  all_extracted_data = []
184
  for i, chunk in enumerate(chunks):
185
  chunk_data = await self._cached_api_call(self._hash_content(chunk), query)
186
  all_extracted_data.append(chunk_data)
187
  extracted_data = self._merge_json_chunks(all_extracted_data)
188
 
189
- self.logger.debug(f"Extracted data (first 500 chars): {extracted_data[:500]}...")
190
-
191
  formatted_result = self._format_result(extracted_data, query)
192
  self.query_cache[cache_key] = formatted_result
193
  return formatted_result
@@ -199,7 +192,7 @@ class WebExtractor:
199
  csv_string, df = self._format_as_csv(extracted_data)
200
  return f"```csv\n{csv_string}\n```", df
201
  elif 'excel' in query.lower():
202
- return self._format_as_excel_and_save(extracted_data)
203
  elif 'sql' in query.lower():
204
  return self._format_as_sql(extracted_data)
205
  elif 'html' in query.lower():
@@ -220,7 +213,7 @@ class WebExtractor:
220
  else:
221
  merged_data.append(data)
222
  except json.JSONDecodeError:
223
- self.logger.error(f"Failed to parse JSON chunk: {chunk[:100]}...")
224
  return json.dumps(merged_data)
225
 
226
  def _format_as_json(self, data: str) -> str:
@@ -260,15 +253,13 @@ class WebExtractor:
260
 
261
  return csv_string, df
262
  except json.JSONDecodeError as e:
263
- self.logger.error(f"JSON Decode Error: {str(e)}")
264
  error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
265
  return error_msg, pd.DataFrame()
266
  except Exception as e:
267
- self.logger.error(f"Unexpected error in _format_as_csv: {str(e)}")
268
  error_msg = f"Error: Failed to convert data to CSV. {str(e)}"
269
  return error_msg, pd.DataFrame()
270
 
271
- def _format_as_excel_and_save(self, data: str) -> str:
272
  json_pattern = r'```json\s*([\s\S]*?)\s*```'
273
  match = re.search(json_pattern, data)
274
  if match:
@@ -276,17 +267,21 @@ class WebExtractor:
276
  try:
277
  parsed_data = json.loads(data)
278
  if not parsed_data:
279
- return "No data to convert to Excel."
280
 
281
  df = pd.DataFrame(parsed_data)
282
- output_filename = "output.xlsx"
283
- with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
284
- df.to_excel(writer, index=False)
285
- return f"Excel data saved to {output_filename}"
 
 
286
  except json.JSONDecodeError:
287
- return f"Error: Invalid JSON data. Raw data: {data[:500]}..."
 
288
  except Exception as e:
289
- return f"Error: Failed to convert data to Excel. {str(e)}"
 
290
 
291
  def _format_as_sql(self, data: str) -> str:
292
  json_pattern = r'```json\s*([\s\S]*?)\s*```'
 
2
  from typing import Dict, Any, Optional, List, Tuple
3
  import json
4
  import pandas as pd
5
+ from io import StringIO, BytesIO
6
+ import base64
7
  import re
8
  from functools import lru_cache
9
  import hashlib
 
18
  from langchain.schema.runnable import RunnableSequence
19
  from langchain.text_splitter import RecursiveCharacterTextSplitter
20
  import tiktoken
 
21
  import csv
22
  from bs4 import BeautifulSoup, Comment
23
 
 
46
  length_function=self.num_tokens_from_string,
47
  )
48
  self.max_tokens = 128000 if model_name == "gpt-4o-mini" else 16385
 
 
49
  self.query_cache = {}
50
  self.content_hash = None
51
 
 
154
 
155
  return text
156
 
157
+ async def _extract_info(self, query: str) -> str:
 
 
158
  if not self.preprocessed_content:
159
  return "Please provide a URL first before asking for information."
160
 
 
175
  extracted_data = await self._cached_api_call(content_hash, query)
176
  else:
177
  chunks = self.optimized_text_splitter(self.preprocessed_content)
 
178
  all_extracted_data = []
179
  for i, chunk in enumerate(chunks):
180
  chunk_data = await self._cached_api_call(self._hash_content(chunk), query)
181
  all_extracted_data.append(chunk_data)
182
  extracted_data = self._merge_json_chunks(all_extracted_data)
183
 
 
 
184
  formatted_result = self._format_result(extracted_data, query)
185
  self.query_cache[cache_key] = formatted_result
186
  return formatted_result
 
192
  csv_string, df = self._format_as_csv(extracted_data)
193
  return f"```csv\n{csv_string}\n```", df
194
  elif 'excel' in query.lower():
195
+ return self._format_as_excel(extracted_data)
196
  elif 'sql' in query.lower():
197
  return self._format_as_sql(extracted_data)
198
  elif 'html' in query.lower():
 
213
  else:
214
  merged_data.append(data)
215
  except json.JSONDecodeError:
216
+ print(f"Error decoding JSON chunk: {chunk[:100]}...")
217
  return json.dumps(merged_data)
218
 
219
  def _format_as_json(self, data: str) -> str:
 
253
 
254
  return csv_string, df
255
  except json.JSONDecodeError as e:
 
256
  error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
257
  return error_msg, pd.DataFrame()
258
  except Exception as e:
 
259
  error_msg = f"Error: Failed to convert data to CSV. {str(e)}"
260
  return error_msg, pd.DataFrame()
261
 
262
+ def _format_as_excel(self, data: str) -> Tuple[BytesIO, pd.DataFrame]:
263
  json_pattern = r'```json\s*([\s\S]*?)\s*```'
264
  match = re.search(json_pattern, data)
265
  if match:
 
267
  try:
268
  parsed_data = json.loads(data)
269
  if not parsed_data:
270
+ return BytesIO(b"No data to convert to Excel."), pd.DataFrame()
271
 
272
  df = pd.DataFrame(parsed_data)
273
+ excel_buffer = BytesIO()
274
+ with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
275
+ df.to_excel(writer, index=False, sheet_name='Sheet1')
276
+ excel_buffer.seek(0)
277
+
278
+ return excel_buffer, df
279
  except json.JSONDecodeError:
280
+ error_msg = f"Error: Invalid JSON data. Raw data: {data[:500]}..."
281
+ return BytesIO(error_msg.encode()), pd.DataFrame()
282
  except Exception as e:
283
+ error_msg = f"Error: Failed to convert data to Excel. {str(e)}"
284
+ return BytesIO(error_msg.encode()), pd.DataFrame()
285
 
286
  def _format_as_sql(self, data: str) -> str:
287
  json_pattern = r'```json\s*([\s\S]*?)\s*```'