Tarun Singh committed on
Commit
d1c030f
·
1 Parent(s): d651660
Files changed (3) hide show
  1. .gitignore +7 -0
  2. app.py +81 -0
  3. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ *.env
4
+ *.env*
5
+ env/
6
+ venv/
7
+ .DS_Store
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import streamlit as st
4
+ from bs4 import BeautifulSoup
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chains import RetrievalQA
8
+ from langchain_core.documents import Document
9
+ from langchain.text_splitter import CharacterTextSplitter
10
+ from langchain_huggingface import HuggingFaceEmbeddings
11
+
12
+ my_openai_key = os.getenv("OPENAI").split(',')
13
+
14
+ # --- Web Scraping Functions ---
15
def fetch_and_extract(url: str) -> str:
    """Download a page and return its visible text.

    Scripts, styles and page chrome (header/footer/nav/aside) are removed.
    On any request failure a Streamlit error is shown and "" is returned.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        st.error(f"Error fetching {url}: {e}")
        return ""

    page = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content tags before extracting text.
    for tag_name in ("script", "style", "header", "footer", "nav", "aside"):
        for tag in page.find_all(tag_name):
            tag.decompose()
    return page.get_text(separator=" ", strip=True)
28
+
29
@st.cache_resource
def load_embeddings():
    """Return the sentence-transformer embedder, built once per session
    via Streamlit's resource cache."""
    model = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model)
35
+
36
@st.cache_resource
def load_llm():
    """Return the chat model, built once per session.

    BUG FIX: ``my_openai_key`` is produced at module level by splitting the
    OPENAI env var on commas, so it may be a *list*; ChatOpenAI's ``api_key``
    expects a single key string. Use the first key in that case.
    """
    key = my_openai_key[0] if isinstance(my_openai_key, list) else my_openai_key
    # NOTE(review): temperature=1.5 is unusually high for factual Q&A — confirm intent.
    return ChatOpenAI(model='gpt-4o', temperature=1.5, max_completion_tokens=100, api_key=key)
39
+
40
def setup_qa_pipeline(text: str):
    """Index *text* in an in-memory FAISS store and return a RetrievalQA chain.

    The text is split on sentence boundaries into ~500-char chunks with
    100-char overlap, embedded, and wired to the cached LLM via the
    "stuff" chain type.
    """
    splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=500,
        chunk_overlap=100,
    )
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(text)]

    index = FAISS.from_documents(docs, embedding=load_embeddings())

    return RetrievalQA.from_chain_type(
        llm=load_llm(),
        chain_type="stuff",
        retriever=index.as_retriever(),
    )
59
+
60
st.title("Web Content Q&A Tool")

# --- Input form: URLs (one per line) and a single question ---
with st.form("qa_form"):
    urls = st.text_area("Enter URLs (one per line):", height=100)
    question = st.text_input("Question:")
    submitted = st.form_submit_button("Get Answer")

if submitted:
    if not (urls and question):
        st.warning("Please provide both URLs and a question")
    else:
        url_list = [url.strip() for url in urls.splitlines() if url.strip()]
        with st.spinner("Analyzing content..."):
            # BUG FIX: filter out failed (empty) fetches before joining.
            # Previously "\n".join(...) over all-empty results produced "\n",
            # which is truthy, so the failure branch below never fired when
            # two or more URLs all failed.
            page_texts = [t for t in (fetch_and_extract(u) for u in url_list) if t]
            content = "\n".join(page_texts)
            if not content:
                st.error("Failed to retrieve content from URLs")
                st.stop()

            qa_chain = setup_qa_pipeline(content)
            # BUG FIX: the original concatenated prefix and question with no
            # separator ("...minimum wordsWhat is ...?"); add ": " between them.
            answer = qa_chain.invoke("Answer this question in minimum words: " + question)
            st.write(answer['result'])
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ requests
2
+ streamlit
3
+ beautifulsoup4
4
+ langchain
5
+ langchain-community
6
+ langchain-openai
7
+ faiss-cpu
8
+ langchain_huggingface