Update app.py
Browse files
app.py
CHANGED
|
@@ -41,8 +41,7 @@ st.set_page_config(
|
|
| 41 |
st.title("🕷️ Web Scraping + RAG Chatbot")
|
| 42 |
st.markdown("""
|
| 43 |
This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
|
| 44 |
-
It can scrape websites, index the content, and answer your
|
| 45 |
-
questions about the scraped content.
|
| 46 |
""")
|
| 47 |
|
| 48 |
# Initialize session state variables
|
|
@@ -90,6 +89,7 @@ def check_versions(chromium_path, chromedriver_path):
|
|
| 90 |
else:
|
| 91 |
logging.error("ChromeDriver binary not found")
|
| 92 |
|
|
|
|
| 93 |
def setup_driver():
|
| 94 |
"""Set up Selenium WebDriver with headless Chromium."""
|
| 95 |
try:
|
|
@@ -165,7 +165,7 @@ def clean_text(text):
|
|
| 165 |
# Remove extra whitespace
|
| 166 |
text = re.sub(r'\s+', ' ', text)
|
| 167 |
# Remove special characters but keep basic punctuation
|
| 168 |
-
text = re.sub(r'[^\w\s.,!?;:]', ' ',
|
| 169 |
return text.strip()
|
| 170 |
|
| 171 |
def scrape_website(url):
|
|
@@ -174,7 +174,7 @@ def scrape_website(url):
|
|
| 174 |
if not driver:
|
| 175 |
return None
|
| 176 |
try:
|
| 177 |
-
|
| 178 |
# Wait for page to load
|
| 179 |
WebDriverWait(driver, 10).until(
|
| 180 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
|
@@ -211,6 +211,7 @@ def scrape_website(url):
|
|
| 211 |
driver.quit()
|
| 212 |
st.session_state.driver_initialized = False
|
| 213 |
|
|
|
|
| 214 |
def initialize_qa_model():
|
| 215 |
"""Initialize the QA model if not already loaded."""
|
| 216 |
if st.session_state.qa_pipeline is None:
|
|
@@ -226,10 +227,14 @@ def initialize_qa_model():
|
|
| 226 |
tokenizer=tokenizer,
|
| 227 |
max_length=200
|
| 228 |
)
|
|
|
|
| 229 |
except Exception as e:
|
| 230 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 231 |
logging.error(f"Error loading QA model: {str(e)}")
|
|
|
|
|
|
|
| 232 |
|
|
|
|
| 233 |
def create_vector_store(text):
|
| 234 |
"""Create a FAISS vector store from the scraped text."""
|
| 235 |
try:
|
|
@@ -295,7 +300,7 @@ if app_mode == "Web Scraping":
|
|
| 295 |
with st.spinner("Scraping website..."):
|
| 296 |
result = scrape_website(url)
|
| 297 |
if result:
|
| 298 |
-
st.success(f
|
| 299 |
# Store scraped content
|
| 300 |
st.session_state.scraped_content = result['content']
|
| 301 |
# Create vector store
|
|
@@ -391,5 +396,7 @@ if st.sidebar.checkbox("Show Debug Logs"):
|
|
| 391 |
st.sidebar.text_area("Logs", log_contents, height=300)
|
| 392 |
else:
|
| 393 |
st.sidebar.info("No logs available yet.")
|
|
|
|
|
|
|
| 394 |
except FileNotFoundError:
|
| 395 |
st.sidebar.warning("Log file not found.")
|
|
|
|
| 41 |
st.title("🕷️ Web Scraping + RAG Chatbot")
|
| 42 |
st.markdown("""
|
| 43 |
This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
|
| 44 |
+
It can scrape websites, index the content, and answer your questions about the scraped content.
|
|
|
|
| 45 |
""")
|
| 46 |
|
| 47 |
# Initialize session state variables
|
|
|
|
| 89 |
else:
|
| 90 |
logging.error("ChromeDriver binary not found")
|
| 91 |
|
| 92 |
+
@st.cache_resource
|
| 93 |
def setup_driver():
|
| 94 |
"""Set up Selenium WebDriver with headless Chromium."""
|
| 95 |
try:
|
|
|
|
| 165 |
# Remove extra whitespace
|
| 166 |
text = re.sub(r'\s+', ' ', text)
|
| 167 |
# Remove special characters but keep basic punctuation
|
| 168 |
+
text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
|
| 169 |
return text.strip()
|
| 170 |
|
| 171 |
def scrape_website(url):
|
|
|
|
| 174 |
if not driver:
|
| 175 |
return None
|
| 176 |
try:
|
| 177 |
+
driver.get(url)
|
| 178 |
# Wait for page to load
|
| 179 |
WebDriverWait(driver, 10).until(
|
| 180 |
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
|
|
|
| 211 |
driver.quit()
|
| 212 |
st.session_state.driver_initialized = False
|
| 213 |
|
| 214 |
+
@st.cache_resource
|
| 215 |
def initialize_qa_model():
|
| 216 |
"""Initialize the QA model if not already loaded."""
|
| 217 |
if st.session_state.qa_pipeline is None:
|
|
|
|
| 227 |
tokenizer=tokenizer,
|
| 228 |
max_length=200
|
| 229 |
)
|
| 230 |
+
return st.session_state.qa_pipeline
|
| 231 |
except Exception as e:
|
| 232 |
st.error(f"Failed to load QA model: {str(e)}")
|
| 233 |
logging.error(f"Error loading QA model: {str(e)}")
|
| 234 |
+
return None
|
| 235 |
+
return st.session_state.qa_pipeline
|
| 236 |
|
| 237 |
+
@st.cache_resource
|
| 238 |
def create_vector_store(text):
|
| 239 |
"""Create a FAISS vector store from the scraped text."""
|
| 240 |
try:
|
|
|
|
| 300 |
with st.spinner("Scraping website..."):
|
| 301 |
result = scrape_website(url)
|
| 302 |
if result:
|
| 303 |
+
st.success(f"Successfully scraped: {result['title']}")
|
| 304 |
# Store scraped content
|
| 305 |
st.session_state.scraped_content = result['content']
|
| 306 |
# Create vector store
|
|
|
|
| 396 |
st.sidebar.text_area("Logs", log_contents, height=300)
|
| 397 |
else:
|
| 398 |
st.sidebar.info("No logs available yet.")
|
| 399 |
+
except PermissionError:
|
| 400 |
+
st.sidebar.error("Cannot read log file due to permission issues.")
|
| 401 |
except FileNotFoundError:
|
| 402 |
st.sidebar.warning("Log file not found.")
|