# Scraper_hub / main.py
# Author: google-labs-jules[bot]
# Commit e1d311a: Deploy CyberScraper 2077 to Hugging Face with Blablador LLM support
import warnings
# Suppress Pydantic V1 warning - upstream LangChain issue with Python 3.14
# See: https://github.com/langchain-ai/langchain/issues/33926
warnings.filterwarnings("ignore", message="Core Pydantic V1 functionality")
import streamlit as st
import streamlit.runtime.scriptrunner_utils.script_run_context as _ctx
_original_get_script_run_ctx = _ctx.get_script_run_ctx
_ctx.get_script_run_ctx = lambda suppress_warning=True: _original_get_script_run_ctx(suppress_warning=suppress_warning)
import json
import asyncio
import os
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
from app.streamlit_web_scraper_chat import StreamlitWebScraperChat
from app.ui_components import display_info_icons, extract_data_from_markdown, format_data
from app.utils import loading_animation
from src.web_extractor import extract_url, get_website_name
from datetime import datetime, timedelta
from src.ollama_models import OllamaModel
from src.utils.error_handler import ErrorMessages
import pandas as pd
import base64
from google_auth_oauthlib.flow import Flow
from io import BytesIO
from src.utils.google_sheets_utils import SCOPES, get_redirect_uri, display_google_sheets_button, initiate_google_auth
from src.scrapers.playwright_scraper import ScraperConfig
import time
import atexit
import logging
logger = logging.getLogger(__name__)
def handle_oauth_callback():
if 'code' in st.query_params:
try:
flow = Flow.from_client_secrets_file(
'client_secret.json',
scopes=SCOPES,
redirect_uri=get_redirect_uri()
)
flow.fetch_token(code=st.query_params['code'])
st.session_state['google_auth_token'] = flow.credentials.to_json()
st.success("Successfully authenticated with Google!")
st.query_params.clear()
except FileNotFoundError:
st.error(ErrorMessages.OAUTH_FAILED)
logger.error("client_secret.json not found")
except Exception as e:
st.error(f"{ErrorMessages.OAUTH_FAILED}\n\nDetails: {str(e)}")
logger.error(f"OAuth error: {str(e)}")
def serialize_bytesio(obj):
if isinstance(obj, BytesIO):
return {
"_type": "BytesIO",
"data": base64.b64encode(obj.getvalue()).decode('utf-8')
}
raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
def deserialize_bytesio(obj):
if isinstance(obj, dict) and "_type" in obj and obj["_type"] == "BytesIO":
return BytesIO(base64.b64decode(obj["data"]))
return obj
def save_chat_history(chat_history):
with open("chat_history.json", "w") as f:
json.dump(chat_history, f, default=serialize_bytesio)
def load_chat_history():
try:
with open("chat_history.json", "r") as f:
return json.load(f, object_hook=deserialize_bytesio)
except FileNotFoundError:
return {}
def safe_process_message(web_scraper_chat, message, conversation_history=None):
if message is None or message.strip() == "":
return "I'm sorry, but I didn't receive any input. Could you please try again?"
try:
progress_placeholder = st.empty()
progress_placeholder.text("Initializing scraper...")
start_time = time.time()
response = web_scraper_chat.process_message(message, conversation_history)
end_time = time.time()
progress_placeholder.text(f"Scraping completed in {end_time - start_time:.2f} seconds.")
# Check for error messages in response
if isinstance(response, str) and ("Error:" in response or "Failed to" in response or "is missing" in response):
st.error(response)
if isinstance(response, tuple):
if len(response) == 2 and isinstance(response[1], pd.DataFrame):
csv_string, df = response
st.dataframe(df)
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)
csv_buffer.seek(0)
st.download_button(
label="Download CSV",
data=csv_buffer,
file_name="data.csv",
mime="text/csv"
)
return csv_string
elif len(response) == 2 and isinstance(response[0], BytesIO):
excel_buffer, df = response
st.dataframe(df)
excel_buffer.seek(0)
st.download_button(
label="Download Excel",
data=excel_buffer,
file_name="data.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
return ("Excel data displayed and available for download.", excel_buffer)
elif isinstance(response, pd.DataFrame):
st.dataframe(response)
csv_buffer = BytesIO()
response.to_csv(csv_buffer, index=False)
csv_buffer.seek(0)
st.download_button(
label="Download CSV",
data=csv_buffer,
file_name="data.csv",
mime="text/csv"
)
return "DataFrame displayed and available for download as CSV."
return response
except ValueError as e:
# Handle API key errors specifically
error_msg = str(e)
if "API Key" in error_msg or "missing" in error_msg.lower():
st.error(error_msg)
else:
st.error(f"{ErrorMessages.SCRAPING_FAILED}\n\nDetails: {error_msg}")
logger.error(f"ValueError during processing: {error_msg}")
return error_msg
except Exception as e:
st.error(f"{ErrorMessages.GENERIC_ERROR}\n\nDetails: {str(e)}")
logger.error(f"Unexpected error during processing: {str(e)}")
return f"{ErrorMessages.GENERIC_ERROR}\n\nDetails: {str(e)}"
def get_date_group(date_str):
date = datetime.strptime(date_str, "%Y-%m-%d")
today = datetime.now().date()
if date.date() == today:
return "Today"
elif date.date() == today - timedelta(days=1):
return "Yesterday"
elif date.date() > today - timedelta(days=7):
return date.strftime("%A")
else:
return date.strftime("%B %d, %Y")
def get_last_url_from_chat(messages):
for message in reversed(messages):
if message['role'] == 'user':
url = extract_url(message['content'])
if url:
return url
return None
def initialize_web_scraper_chat(url=None):
if st.session_state.selected_model.startswith("ollama:"):
model = st.session_state.selected_model
else:
model = st.session_state.selected_model
scraper_config = ScraperConfig(
use_current_browser=st.session_state.use_current_browser,
headless=not st.session_state.use_current_browser,
max_retries=3,
delay_after_load=5,
debug=True,
wait_for='domcontentloaded'
)
try:
web_scraper_chat = StreamlitWebScraperChat(model_name=model, scraper_config=scraper_config)
if url:
web_scraper_chat.process_message(url)
website_name = get_website_name(url)
st.session_state.chat_history[st.session_state.current_chat_id]["name"] = website_name
return web_scraper_chat
except ValueError as e:
# Handle API key errors
st.error(str(e))
return None
except Exception as e:
st.error(f"{ErrorMessages.GENERIC_ERROR}\n\nDetails: {str(e)}")
logger.error(f"Error initializing web scraper: {str(e)}")
return None
async def list_ollama_models():
try:
return await OllamaModel.list_models()
except Exception as e:
logger.warning(f"Error fetching Ollama models: {str(e)}")
# Don't show error to user, just return empty list
# The warning in the sidebar will guide users
return []
def load_css():
with open("app/styles.css", "r") as f:
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
@st.cache_data
def get_image_base64(image_path: str) -> str:
"""Get base64 encoded image with caching to avoid re-encoding on every render."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode()
def check_service_status() -> dict:
"""Check the status of all services and return a dict with their status."""
status = {
"openai": {
"name": "OpenAI",
"configured": bool(os.getenv("OPENAI_API_KEY")),
"env_var": "OPENAI_API_KEY"
},
"gemini": {
"name": "Gemini",
"configured": bool(os.getenv("GOOGLE_API_KEY")),
"env_var": "GOOGLE_API_KEY"
},
"blablador": {
"name": "Blablador",
"configured": bool(os.getenv("BLABLADOR_API_KEY")),
"env_var": "BLABLADOR_API_KEY"
},
"tor": {
"name": "Tor",
"configured": False, # Will be checked dynamically
"env_var": None
},
"google_sheets": {
"name": "Google Sheets",
"configured": os.path.exists("client_secret.json"),
"env_var": "client_secret.json"
}
}
# Check Tor status by checking if port 9050 is open
import socket
def is_tor_running():
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(1)
result = sock.connect_ex(('127.0.0.1', 9050))
sock.close()
return result == 0
except Exception:
return False
status["tor"]["configured"] = is_tor_running()
return status
def display_service_status():
"""Display service status with checkmarks/crosses in the sidebar."""
status = check_service_status()
# Inject CSS styles
st.markdown("""
<style>
div[data-testid="stSidebar"] > div:first-child {
overflow: visible !important;
}
.service-status {
display: flex;
align-items: center;
margin: 1px 0;
}
.status-icon {
width: 25px;
font-size: 18px;
text-align: center;
margin-right: 8px;
}
.status-icon-check {
color: #28a745;
}
.status-icon-cross {
color: #dc3545;
}
.status-text {
flex: 1;
font-size: 14px;
}
.status-env {
font-size: 11px;
color: #6c757d;
margin-left: 4px;
}
</style>
""", unsafe_allow_html=True)
st.markdown("### Setup Status")
for key, info in status.items():
if info["configured"]:
icon_html = f'<span class="status-icon status-icon-check">✓</span>'
env_html = ""
else:
icon_html = f'<span class="status-icon status-icon-cross">✗</span>'
if info["env_var"]:
env_html = f'<span class="status-env">({info["env_var"]})</span>'
else:
env_html = '<span class="status-env">(Tor not running)</span>'
html = f"""
<div class="service-status">
{icon_html}
<span class="status-text">{info["name"]}</span>
{env_html}
</div>
"""
st.markdown(html, unsafe_allow_html=True)
# Show setup help if any service is missing
missing_services = [key for key, info in status.items() if not info["configured"]]
if missing_services:
st.markdown("---")
st.markdown("""<p style="margin: 0; padding: 0; line-height: 1.4;"><strong>Setup Help:</strong><br>
See <a href="https://github.com/itsOwen/CyberScraper-2077/blob/main/README.md">README</a> for configuration instructions.</p>""", unsafe_allow_html=True)
def render_message(role, content, avatar_path):
message_class = "user-message" if role == "user" else "assistant-message"
avatar_base64 = get_image_base64(avatar_path)
return f"""
<div class="chat-message {message_class}">
<div class="avatar">
<img src="data:image/png;base64,{avatar_base64}" alt="{role} avatar">
</div>
<div class="message-content">{content}</div>
</div>
"""
def display_message_with_sheets_upload(message, message_index):
content = message["content"]
if isinstance(content, (str, bytes, BytesIO)):
data = extract_data_from_markdown(content)
if data is not None:
try:
is_excel = isinstance(data, BytesIO) or (isinstance(content, str) and 'excel' in content.lower())
if is_excel:
df = format_data(data, 'excel')
else:
df = format_data(data, 'csv')
if df is not None:
st.dataframe(df)
if not is_excel:
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)
csv_buffer.seek(0)
st.download_button(
label="📥 Download as CSV",
data=csv_buffer,
file_name="data.csv",
mime="text/csv",
key=f"csv_download_{message_index}"
)
else:
excel_buffer = BytesIO()
with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
df.to_excel(writer, index=False, sheet_name='Sheet1')
excel_buffer.seek(0)
st.download_button(
label="📥 Download as Excel",
data=excel_buffer,
file_name="data.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
key=f"excel_download_{message_index}"
)
display_google_sheets_button(df, f"sheets_upload_{message_index}")
else:
st.warning("Failed to display data as a table. Showing raw content:")
st.code(content)
except Exception as e:
st.error(f"Error processing data: {str(e)}")
st.code(content)
else:
st.markdown(content)
else:
st.markdown(str(content))
def cleanup():
"""Clean up resources on exit."""
try:
if 'web_scraper_chat' in st.session_state and st.session_state.web_scraper_chat:
del st.session_state.web_scraper_chat
except Exception:
pass # Ignore errors during cleanup
atexit.register(cleanup)
def main():
st.set_page_config(
page_title="CyberScraper 2077",
page_icon="app/icons/radiation.png",
layout="wide"
)
load_css()
handle_oauth_callback()
# avatar paths
user_avatar_path = "app/icons/man.png"
ai_avatar_path = "app/icons/skull.png"
if 'chat_history' not in st.session_state:
st.session_state.chat_history = load_chat_history()
if 'current_chat_id' not in st.session_state or st.session_state.current_chat_id not in st.session_state.chat_history:
if st.session_state.chat_history:
st.session_state.current_chat_id = next(iter(st.session_state.chat_history))
else:
new_chat_id = str(datetime.now().timestamp())
st.session_state.chat_history[new_chat_id] = {
"messages": [],
"date": datetime.now().strftime("%Y-%m-%d")
}
st.session_state.current_chat_id = new_chat_id
save_chat_history(st.session_state.chat_history)
if 'selected_model' not in st.session_state:
st.session_state.selected_model = "alias-fast"
if 'web_scraper_chat' not in st.session_state:
st.session_state.web_scraper_chat = None
with st.sidebar:
st.title("CyberScraper-2077")
# Model selection
st.subheader("Select Model")
default_models = ["alias-fast", "alias-large", "gpt-4o-mini", "gemini-1.5-flash"]
ollama_models = st.session_state.get('ollama_models', [])
all_models = default_models + [f"ollama:{model}" for model in ollama_models]
selected_model = st.selectbox("Choose a model", all_models, index=all_models.index(st.session_state.selected_model) if st.session_state.selected_model in all_models else 0)
if selected_model != st.session_state.selected_model:
st.session_state.selected_model = selected_model
st.session_state.web_scraper_chat = None
st.rerun()
# Display service status with checkmarks/crosses
display_service_status()
st.markdown("---")
st.session_state.use_current_browser = st.checkbox("Use Current Browser (No Docker)", value=False, help="Works Natively, Doesn't Work with Docker. if a website is blocking your browser, you can use this option to use the current browser instead of opening a new one.")
if st.button("Refresh Ollama Models"):
with st.spinner("Fetching Ollama models..."):
st.session_state.ollama_models = asyncio.run(list_ollama_models())
st.success(f"Found {len(st.session_state.ollama_models)} Ollama models")
st.rerun()
if st.button("+ 🗨️ New Chat", key="new_chat", use_container_width=True):
new_chat_id = str(datetime.now().timestamp())
st.session_state.chat_history[new_chat_id] = {
"messages": [],
"date": datetime.now().strftime("%Y-%m-%d"),
"name": "🗨️ New Chat"
}
st.session_state.current_chat_id = new_chat_id
st.session_state.web_scraper_chat = None
save_chat_history(st.session_state.chat_history)
st.rerun()
grouped_chats = {}
for chat_id, chat_data in st.session_state.chat_history.items():
date_group = get_date_group(chat_data['date'])
if date_group not in grouped_chats:
grouped_chats[date_group] = []
grouped_chats[date_group].append((chat_id, chat_data))
for date_group, chats in grouped_chats.items():
st.markdown(f"<div class='date-group'>{date_group}</div>", unsafe_allow_html=True)
for chat_id, chat_data in chats:
button_label = chat_data.get('name', "🗨️ Unnamed Chat")
col1, col2 = st.columns([0.78, 0.22])
with col1:
if st.button(button_label, key=f"history_{chat_id}", use_container_width=True):
st.session_state.current_chat_id = chat_id
messages = chat_data['messages']
last_url = get_last_url_from_chat(messages)
if last_url and not st.session_state.web_scraper_chat:
st.session_state.web_scraper_chat = initialize_web_scraper_chat(last_url)
st.rerun()
with col2:
if st.button("🗑️", key=f"delete_{chat_id}"):
del st.session_state.chat_history[chat_id]
save_chat_history(st.session_state.chat_history)
if st.session_state.current_chat_id == chat_id:
if st.session_state.chat_history:
st.session_state.current_chat_id = next(iter(st.session_state.chat_history))
else:
st.session_state.current_chat_id = None
st.session_state.web_scraper_chat = None
st.rerun()
st.markdown(
"""
<h1 style="text-align: center; font-size: 30px; color: #333;">CyberScraper 2077</h1>
""",
unsafe_allow_html=True
)
display_info_icons()
if st.session_state.current_chat_id not in st.session_state.chat_history:
if st.session_state.chat_history:
st.session_state.current_chat_id = next(iter(st.session_state.chat_history))
else:
new_chat_id = str(datetime.now().timestamp())
st.session_state.chat_history[new_chat_id] = {
"messages": [],
"date": datetime.now().strftime("%Y-%m-%d")
}
st.session_state.current_chat_id = new_chat_id
save_chat_history(st.session_state.chat_history)
chat_container = st.container()
with chat_container:
st.markdown('<div class="chat-container">', unsafe_allow_html=True)
for index, message in enumerate(st.session_state.chat_history[st.session_state.current_chat_id]["messages"]):
if message["role"] == "user":
st.markdown(render_message("user", message["content"], user_avatar_path), unsafe_allow_html=True)
else:
with st.container():
st.markdown(render_message("assistant", "", ai_avatar_path), unsafe_allow_html=True)
display_message_with_sheets_upload(message, index)
st.markdown('</div>', unsafe_allow_html=True)
prompt = st.chat_input("Enter the URL to scrape or ask a question regarding the data", key="user_input")
if prompt:
st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "user", "content": prompt})
if not st.session_state.web_scraper_chat:
st.session_state.web_scraper_chat = initialize_web_scraper_chat()
url = extract_url(prompt)
if url:
website_name = get_website_name(url)
st.session_state.chat_history[st.session_state.current_chat_id]["name"] = website_name
with st.chat_message("assistant"):
try:
# Get current chat messages for conversation context
chat_messages = st.session_state.chat_history[st.session_state.current_chat_id]["messages"]
full_response = loading_animation(
safe_process_message,
st.session_state.web_scraper_chat,
prompt,
chat_messages
)
if isinstance(full_response, str) and not full_response.startswith("Error:"):
st.success("Scraping completed successfully!")
if full_response is not None:
if isinstance(full_response, tuple) and len(full_response) == 2 and isinstance(full_response[1], BytesIO):
st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response[0]})
else:
st.session_state.chat_history[st.session_state.current_chat_id]["messages"].append({"role": "assistant", "content": full_response})
save_chat_history(st.session_state.chat_history)
except Exception as e:
st.error(f"An unexpected error occurred: {str(e)}")
save_chat_history(st.session_state.chat_history)
st.rerun()
st.markdown(
"""
<p style="text-align: center; font-size: 12px; color: #666666;">CyberScraper 2077 can make mistakes sometimes. Report any issues to the developers.</p>
""",
unsafe_allow_html=True
)
if __name__ == "__main__":
main()