Spaces:
Paused
Paused
itsOwen committed on
Commit ·
c60fc06
1
Parent(s): a57b699
chat fix, a few other fixes
Browse files
- main.py +6 -5
- src/scrapers/playwright_scraper.py +1 -1
- src/web_extractor.py +14 -1
main.py
CHANGED
|
@@ -286,7 +286,8 @@ def main():
|
|
| 286 |
new_chat_id = str(datetime.now().timestamp())
|
| 287 |
st.session_state.chat_history[new_chat_id] = {
|
| 288 |
"messages": [],
|
| 289 |
-
"date": datetime.now().strftime("%Y-%m-%d")
|
|
|
|
| 290 |
}
|
| 291 |
st.session_state.current_chat_id = new_chat_id
|
| 292 |
st.session_state.web_scraper_chat = None
|
|
@@ -305,12 +306,12 @@ def main():
|
|
| 305 |
for chat_id, chat_data in chats:
|
| 306 |
messages = chat_data['messages']
|
| 307 |
if messages:
|
| 308 |
-
button_label = f"{messages[0]['content'][:25]}..."
|
| 309 |
else:
|
| 310 |
-
button_label = "🗨️ Empty Chat"
|
| 311 |
-
|
| 312 |
col1, col2 = st.columns([0.85, 0.15])
|
| 313 |
-
|
| 314 |
with col1:
|
| 315 |
if st.button(button_label, key=f"history_{chat_id}", use_container_width=True):
|
| 316 |
st.session_state.current_chat_id = chat_id
|
|
|
|
| 286 |
new_chat_id = str(datetime.now().timestamp())
|
| 287 |
st.session_state.chat_history[new_chat_id] = {
|
| 288 |
"messages": [],
|
| 289 |
+
"date": datetime.now().strftime("%Y-%m-%d"),
|
| 290 |
+
"name": "🗨️ New Chat"
|
| 291 |
}
|
| 292 |
st.session_state.current_chat_id = new_chat_id
|
| 293 |
st.session_state.web_scraper_chat = None
|
|
|
|
| 306 |
for chat_id, chat_data in chats:
|
| 307 |
messages = chat_data['messages']
|
| 308 |
if messages:
|
| 309 |
+
button_label = chat_data.get('name', f"{messages[0]['content'][:25]}...")
|
| 310 |
else:
|
| 311 |
+
button_label = chat_data.get('name', "🗨️ Empty Chat")
|
| 312 |
+
|
| 313 |
col1, col2 = st.columns([0.85, 0.15])
|
| 314 |
+
|
| 315 |
with col1:
|
| 316 |
if st.button(button_label, key=f"history_{chat_id}", use_container_width=True):
|
| 317 |
st.session_state.current_chat_id = chat_id
|
src/scrapers/playwright_scraper.py
CHANGED
|
@@ -17,7 +17,7 @@ class ScraperConfig:
|
|
| 17 |
headless: bool = True,
|
| 18 |
debug: bool = False,
|
| 19 |
timeout: int = 60000,
|
| 20 |
-
wait_for: str = '
|
| 21 |
self.use_stealth = use_stealth
|
| 22 |
self.simulate_human = simulate_human
|
| 23 |
self.use_custom_headers = use_custom_headers
|
|
|
|
| 17 |
headless: bool = True,
|
| 18 |
debug: bool = False,
|
| 19 |
timeout: int = 60000,
|
| 20 |
+
wait_for: str = 'domcontentloaded'): # use networkidle instead of domcontentloaded if you want!
|
| 21 |
self.use_stealth = use_stealth
|
| 22 |
self.simulate_human = simulate_human
|
| 23 |
self.use_custom_headers = use_custom_headers
|
src/web_extractor.py
CHANGED
|
@@ -21,6 +21,8 @@ import tiktoken
|
|
| 21 |
import csv
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
| 23 |
from .scrapers.playwright_scraper import PlaywrightScraper, ScraperConfig
|
|
|
|
|
|
|
| 24 |
|
| 25 |
class WebExtractor:
|
| 26 |
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None, headless: bool = True, debug: bool = False):
|
|
@@ -60,6 +62,13 @@ class WebExtractor:
|
|
| 60 |
def _hash_content(self, content: str) -> str:
|
| 61 |
return hashlib.md5(content.encode()).hexdigest()
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
@lru_cache(maxsize=100)
|
| 64 |
async def _cached_api_call(self, content_hash: str, query: str) -> str:
|
| 65 |
if isinstance(self.model, OllamaModel):
|
|
@@ -114,7 +123,11 @@ class WebExtractor:
|
|
| 114 |
pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
|
| 115 |
url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
|
| 116 |
handle_captcha = '-captcha' in user_input.lower()
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
|
| 119 |
elif not self.current_content:
|
| 120 |
response = "Please provide a URL first before asking for information."
|
|
|
|
| 21 |
import csv
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
| 23 |
from .scrapers.playwright_scraper import PlaywrightScraper, ScraperConfig
|
| 24 |
+
from urllib.parse import urlparse
|
| 25 |
+
import streamlit as st
|
| 26 |
|
| 27 |
class WebExtractor:
|
| 28 |
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None, headless: bool = True, debug: bool = False):
|
|
|
|
| 62 |
def _hash_content(self, content: str) -> str:
|
| 63 |
return hashlib.md5(content.encode()).hexdigest()
|
| 64 |
|
| 65 |
+
def get_website_name(self, url: str) -> str:
|
| 66 |
+
parsed_url = urlparse(url)
|
| 67 |
+
domain = parsed_url.netloc
|
| 68 |
+
if domain.startswith('www.'):
|
| 69 |
+
domain = domain[4:]
|
| 70 |
+
return domain.split('.')[0].capitalize()
|
| 71 |
+
|
| 72 |
@lru_cache(maxsize=100)
|
| 73 |
async def _cached_api_call(self, content_hash: str, query: str) -> str:
|
| 74 |
if isinstance(self.model, OllamaModel):
|
|
|
|
| 123 |
pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
|
| 124 |
url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
|
| 125 |
handle_captcha = '-captcha' in user_input.lower()
|
| 126 |
+
|
| 127 |
+
website_name = self.get_website_name(url)
|
| 128 |
+
|
| 129 |
+
st.session_state.chat_history[st.session_state.current_chat_id]["name"] = website_name
|
| 130 |
+
|
| 131 |
response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
|
| 132 |
elif not self.current_content:
|
| 133 |
response = "Please provide a URL first before asking for information."
|