File size: 2,778 Bytes
d1c030f e601217 d1c030f e601217 d1c030f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain_openai import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
# OpenAI API key read from the "OPENAI" environment variable (None when unset);
# passed to ChatOpenAI in load_llm().
my_openai_key = os.getenv("OPENAI")
# --- Web Scraping Functions ---
def fetch_and_extract(url: str) -> str:
    """Download *url* and return the text of the page with boilerplate removed.

    The request is sent with a desktop-browser User-Agent and a 15 second
    timeout. Any ``requests.RequestException`` (including non-2xx statuses
    raised by ``raise_for_status``) is reported through ``st.error`` and an
    empty string is returned instead of raising.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with ``script``, ``style``, ``header``, ``footer``,
        ``nav`` and ``aside`` elements removed, fragments joined by single
        spaces; ``""`` when the request failed.
    """
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/98.0.4758.102 Safari/537.36"
        ),
    }

    try:
        page = requests.get(url, headers=browser_headers, timeout=15)
        page.raise_for_status()
    except requests.RequestException as exc:
        st.error(f"Error fetching {url}: {exc}")
        return ""

    parsed = BeautifulSoup(page.text, "html.parser")

    # Drop non-content elements so they do not end up in the extracted text.
    boilerplate_tags = ["script", "style", "header", "footer", "nav", "aside"]
    for tag in parsed.find_all(boilerplate_tags):
        tag.decompose()

    return parsed.get_text(separator=" ", strip=True)
@st.cache_resource
def load_embeddings():
    """Create the HuggingFace embedding model used to vectorize text chunks.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the returned
    object rather than constructing it on every call. No ``model_kwargs``
    are passed, so device selection is left to the library default.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=embedding_model)
@st.cache_resource
def load_llm():
    """Create the chat model that answers questions over retrieved text.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the returned
    object rather than constructing it on every call.

    Returns:
        A ``ChatOpenAI`` instance configured for ``gpt-4o`` with temperature
        0.7, at most 100 completion tokens, and the module-level
        ``my_openai_key`` as API key.
    """
    llm_settings = {
        "model": "gpt-4o",
        "temperature": 0.7,
        "max_completion_tokens": 100,
        "api_key": my_openai_key,
    }
    return ChatOpenAI(**llm_settings)
def setup_qa_pipeline(text: str):
    """Build a retrieval question-answering chain over *text*.

    The text is split on ``"."`` into chunks (size 500, overlap 100), each
    chunk is wrapped in a ``Document``, the documents are indexed in a FAISS
    vector store using ``load_embeddings()``, and a ``RetrievalQA`` chain of
    type ``"stuff"`` is built on that store with ``load_llm()``.

    Args:
        text: The full corpus to index.

    Returns:
        The ``RetrievalQA`` chain produced by ``RetrievalQA.from_chain_type``.
    """
    splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_text(text)
    documents = [Document(page_content=chunk) for chunk in chunks]

    index = FAISS.from_documents(documents, embedding=load_embeddings())
    retriever = index.as_retriever()

    return RetrievalQA.from_chain_type(
        llm=load_llm(),
        chain_type="stuff",
        retriever=retriever,
    )
# --- Streamlit UI ---
# Page flow: collect URLs and a question in a form, scrape every URL, index the
# combined text, and show the model's answer.
st.title("Web Content Q&A Tool")

with st.form("qa_form"):
    urls = st.text_area("Enter URLs (one per line):", height=100)
    question = st.text_input("Question:")
    submitted = st.form_submit_button("Get Answer")

if submitted:
    # Validate the parsed values rather than the raw text so that input made
    # only of whitespace / blank lines is rejected up front.
    url_list = [url.strip() for url in urls.splitlines() if url.strip()]
    query = question.strip()

    if not (url_list and query):
        st.warning("Please provide both URLs and a question")
    else:
        with st.spinner("Analyzing content..."):
            # Keep only pages that yielded text: joining several failed
            # (empty) fetches would produce "\n", which is truthy and would
            # slip past the emptiness check below.
            pages = [fetch_and_extract(url) for url in url_list]
            content = "\n".join(page for page in pages if page)

            if not content:
                st.error("Failed to retrieve content from URLs")
                st.stop()

            qa_chain = setup_qa_pipeline(content)
            # Separate the instruction from the question so the two do not
            # run together into a single word in the prompt.
            answer = qa_chain.invoke(
                "Answer this question in minimum words: " + query
            )
            st.write(answer['result'])
|