MahatirTusher committed on
Commit
9c79583
·
verified ·
1 Parent(s): d6cec65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -8,6 +8,7 @@ import os
8
  import time
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
 
11
 
12
  # Load environment variables (optional)
13
  load_dotenv()
@@ -116,9 +117,11 @@ if process_url_clicked:
116
  with st.spinner("Processing URL..."):
117
  try:
118
  st.text("Data Loading...Started...✅✅✅")
 
 
119
  loader = WebBaseLoader(
120
  web_paths=[url.strip()],
121
- bs_kwargs={"parse_only": ["title", "p", "h1", "h2", "h3"]},
122
  requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
123
  )
124
  data = loader.load()
 
8
  import time
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
11
+ from bs4 import SoupStrainer
12
 
13
  # Load environment variables (optional)
14
  load_dotenv()
 
117
  with st.spinner("Processing URL..."):
118
  try:
119
  st.text("Data Loading...Started...✅✅✅")
120
+ # Use SoupStrainer to specify tags to parse
121
+ parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
122
  loader = WebBaseLoader(
123
  web_paths=[url.strip()],
124
+ bs_kwargs={"parse_only": parse_only},
125
  requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
126
  )
127
  data = loader.load()