itsOwen committed on
Commit
c60fc06
·
1 Parent(s): a57b699

chat fix, a few other fixes

Browse files
main.py CHANGED
@@ -286,7 +286,8 @@ def main():
286
  new_chat_id = str(datetime.now().timestamp())
287
  st.session_state.chat_history[new_chat_id] = {
288
  "messages": [],
289
- "date": datetime.now().strftime("%Y-%m-%d")
 
290
  }
291
  st.session_state.current_chat_id = new_chat_id
292
  st.session_state.web_scraper_chat = None
@@ -305,12 +306,12 @@ def main():
305
  for chat_id, chat_data in chats:
306
  messages = chat_data['messages']
307
  if messages:
308
- button_label = f"{messages[0]['content'][:25]}..."
309
  else:
310
- button_label = "🗨️ Empty Chat"
311
-
312
  col1, col2 = st.columns([0.85, 0.15])
313
-
314
  with col1:
315
  if st.button(button_label, key=f"history_{chat_id}", use_container_width=True):
316
  st.session_state.current_chat_id = chat_id
 
286
  new_chat_id = str(datetime.now().timestamp())
287
  st.session_state.chat_history[new_chat_id] = {
288
  "messages": [],
289
+ "date": datetime.now().strftime("%Y-%m-%d"),
290
+ "name": "🗨️ New Chat"
291
  }
292
  st.session_state.current_chat_id = new_chat_id
293
  st.session_state.web_scraper_chat = None
 
306
  for chat_id, chat_data in chats:
307
  messages = chat_data['messages']
308
  if messages:
309
+ button_label = chat_data.get('name', f"{messages[0]['content'][:25]}...")
310
  else:
311
+ button_label = chat_data.get('name', "🗨️ Empty Chat")
312
+
313
  col1, col2 = st.columns([0.85, 0.15])
314
+
315
  with col1:
316
  if st.button(button_label, key=f"history_{chat_id}", use_container_width=True):
317
  st.session_state.current_chat_id = chat_id
src/scrapers/playwright_scraper.py CHANGED
@@ -17,7 +17,7 @@ class ScraperConfig:
17
  headless: bool = True,
18
  debug: bool = False,
19
  timeout: int = 60000,
20
- wait_for: str = 'networkidle'):
21
  self.use_stealth = use_stealth
22
  self.simulate_human = simulate_human
23
  self.use_custom_headers = use_custom_headers
 
17
  headless: bool = True,
18
  debug: bool = False,
19
  timeout: int = 60000,
20
+ wait_for: str = 'domcontentloaded'): # use networkidle instead of domcontentloaded if you want!
21
  self.use_stealth = use_stealth
22
  self.simulate_human = simulate_human
23
  self.use_custom_headers = use_custom_headers
src/web_extractor.py CHANGED
@@ -21,6 +21,8 @@ import tiktoken
21
  import csv
22
  from bs4 import BeautifulSoup, Comment
23
  from .scrapers.playwright_scraper import PlaywrightScraper, ScraperConfig
 
 
24
 
25
  class WebExtractor:
26
  def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None, headless: bool = True, debug: bool = False):
@@ -60,6 +62,13 @@ class WebExtractor:
60
  def _hash_content(self, content: str) -> str:
61
  return hashlib.md5(content.encode()).hexdigest()
62
 
 
 
 
 
 
 
 
63
  @lru_cache(maxsize=100)
64
  async def _cached_api_call(self, content_hash: str, query: str) -> str:
65
  if isinstance(self.model, OllamaModel):
@@ -114,7 +123,11 @@ class WebExtractor:
114
  pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
115
  url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
116
  handle_captcha = '-captcha' in user_input.lower()
117
-
 
 
 
 
118
  response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
119
  elif not self.current_content:
120
  response = "Please provide a URL first before asking for information."
 
21
  import csv
22
  from bs4 import BeautifulSoup, Comment
23
  from .scrapers.playwright_scraper import PlaywrightScraper, ScraperConfig
24
+ from urllib.parse import urlparse
25
+ import streamlit as st
26
 
27
  class WebExtractor:
28
  def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None, headless: bool = True, debug: bool = False):
 
62
  def _hash_content(self, content: str) -> str:
63
  return hashlib.md5(content.encode()).hexdigest()
64
 
65
def get_website_name(self, url: str) -> str:
    """Return a short display name derived from *url*'s domain.

    Strips a leading ``www.`` and keeps only the first dotted label,
    capitalized — e.g. ``https://www.example.com/x`` -> ``"Example"``.

    NOTE: for multi-part TLD hosts this still returns only the first
    label (``bbc.co.uk`` -> ``"Bbc"``), matching the original behavior.
    """
    parsed_url = urlparse(url)
    # urlparse puts a scheme-less input ("example.com/path") entirely in
    # .path and leaves .netloc empty — fall back to the first path
    # segment so such URLs yield a usable name instead of "".
    domain = parsed_url.netloc or parsed_url.path.split('/')[0]
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain.split('.')[0].capitalize()
71
+
72
  @lru_cache(maxsize=100)
73
  async def _cached_api_call(self, content_hash: str, query: str) -> str:
74
  if isinstance(self.model, OllamaModel):
 
123
  pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
124
  url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
125
  handle_captcha = '-captcha' in user_input.lower()
126
+
127
+ website_name = self.get_website_name(url)
128
+
129
+ st.session_state.chat_history[st.session_state.current_chat_id]["name"] = website_name
130
+
131
  response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
132
  elif not self.current_content:
133
  response = "Please provide a URL first before asking for information."