Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import os
|
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
|
|
|
| 11 |
|
| 12 |
# Load environment variables (optional)
|
| 13 |
load_dotenv()
|
|
@@ -116,9 +117,11 @@ if process_url_clicked:
|
|
| 116 |
with st.spinner("Processing URL..."):
|
| 117 |
try:
|
| 118 |
st.text("Data Loading...Started...✅✅✅")
|
|
|
|
|
|
|
| 119 |
loader = WebBaseLoader(
|
| 120 |
web_paths=[url.strip()],
|
| 121 |
-
bs_kwargs={"parse_only":
|
| 122 |
requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
|
| 123 |
)
|
| 124 |
data = loader.load()
|
|
|
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
+
from bs4 import SoupStrainer
|
| 12 |
|
| 13 |
# Load environment variables (optional)
|
| 14 |
load_dotenv()
|
|
|
|
| 117 |
with st.spinner("Processing URL..."):
|
| 118 |
try:
|
| 119 |
st.text("Data Loading...Started...✅✅✅")
|
| 120 |
+
# Use SoupStrainer to specify tags to parse
|
| 121 |
+
parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
|
| 122 |
loader = WebBaseLoader(
|
| 123 |
web_paths=[url.strip()],
|
| 124 |
+
bs_kwargs={"parse_only": parse_only},
|
| 125 |
requests_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}}
|
| 126 |
)
|
| 127 |
data = loader.load()
|