muddasser committed on
Commit
7229d87
·
verified ·
1 Parent(s): 2e1fc9a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from selenium import webdriver
3
+ from selenium.webdriver.chrome.service import Service
4
+ from webdriver_manager.chrome import ChromeDriverManager
5
+ from selenium.webdriver.chrome.options import Options
6
+ import time
7
+
8
+ from sentence_transformers import SentenceTransformer
9
+ import faiss
10
+ import numpy as np
11
+ from transformers import pipeline
12
+
13
+ # -------------------------------
14
+ # 1. Setup Selenium (Headless Chrome for Hugging Face/Streamlit)
15
+ # -------------------------------
16
def init_driver():
    """Build and return a headless Chrome WebDriver.

    The chromedriver binary is resolved via ``ChromeDriverManager().install()``
    and Chrome is launched with the command-line flags listed below.

    Returns:
        A ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
26
+
27
+ # -------------------------------
28
+ # 2. Scrape website text with Selenium
29
+ # -------------------------------
30
def scrape_website(url):
    """Load *url* in headless Chrome and return the rendered page source.

    Args:
        url: Address handed to ``driver.get``.

    Returns:
        The value of ``driver.page_source`` (raw HTML, not extracted text),
        captured after a fixed three-second wait.

    Raises:
        Whatever ``init_driver`` or the WebDriver raises; the browser is
        still shut down in that case.
    """
    driver = init_driver()
    try:
        driver.get(url)
        time.sleep(3)  # fixed pause so client-side JavaScript can run
        return driver.page_source
    finally:
        # Always close the browser, even when navigation fails; otherwise
        # every failed scrape would leave a Chrome process behind.
        driver.quit()
37
+
38
+ # -------------------------------
39
+ # 3. Embed and store in FAISS
40
+ # -------------------------------
41
@st.cache_resource
def _load_embedder():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("all-MiniLM-L6-v2")


embedder = _load_embedder()

# Vector width the FAISS index is built for; embeddings added to the index
# must have this many components.
dimension = 384

# Streamlit re-executes this script from the top on every widget interaction.
# A plain module-level index/list would therefore be recreated empty between
# "Scrape & Index" and "Get Answer", so both are kept in the per-session state
# and only created the first time the session runs the script.
if "faiss_index" not in st.session_state:
    st.session_state["faiss_index"] = faiss.IndexFlatL2(dimension)
    st.session_state["documents"] = []

index = st.session_state["faiss_index"]
documents = st.session_state["documents"]  # documents[i] is the text of index row i
46
+
47
def add_to_faiss(text, chunk_words=150):
    """Split *text* into chunks, embed them, and add them to the FAISS index.

    Indexing a whole page as a single vector means retrieval can only ever
    return the entire page, so the text is cut into fixed-size word windows
    and each window is stored as its own document.

    Args:
        text: Content to index. ``None``, empty, or whitespace-only input is
            ignored.
        chunk_words: Maximum number of whitespace-separated words per chunk.

    Raises:
        ValueError: If ``chunk_words`` is smaller than 1.
    """
    if chunk_words < 1:
        raise ValueError("chunk_words must be at least 1")

    words = text.split() if text else []
    if not words:
        return  # nothing to index

    chunks = [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]

    # Encode all chunks in one batch; row i of the result belongs to chunks[i].
    embeddings = embedder.encode(chunks)
    index.add(np.asarray(embeddings, dtype="float32"))

    # Appended in the same order as the vectors so that FAISS row ids keep
    # matching positions in `documents`.
    documents.extend(chunks)
52
+
53
def retrieve(query, k=1):
    """Return up to *k* indexed documents closest to *query*.

    Args:
        query: Text to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        A list of document strings, nearest first. The list is empty when
        ``k`` is not positive or nothing has been indexed yet, and may be
        shorter than ``k`` when fewer documents are available.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with the id -1, which as a Python index would silently pick the last
    # document (or raise on an empty list).
    k = min(k, index.ntotal)

    query_vec = np.asarray(embedder.encode([query]), dtype="float32")
    _, neighbour_ids = index.search(query_vec, k)

    return [documents[i] for i in neighbour_ids[0] if 0 <= i < len(documents)]
57
+
58
+ # -------------------------------
59
+ # 4. QA Model (FLAN-T5-small)
60
+ # -------------------------------
61
@st.cache_resource
def _load_qa_pipeline():
    """Create the FLAN-T5 text2text pipeline once and reuse it across reruns."""
    return pipeline("text2text-generation", model="google/flan-t5-small")


# Streamlit re-executes the script on every interaction; caching avoids
# rebuilding the pipeline each time.
qa_pipeline = _load_qa_pipeline()
62
+
63
def answer_query(query, max_context_chars=1500):
    """Answer *query* using the closest indexed document as context.

    Args:
        query: The user's question.
        max_context_chars: Upper bound on the number of context characters
            placed in the prompt.

    Returns:
        The generated answer text, or a short notice when no context could
        be retrieved.
    """
    context_docs = retrieve(query, k=1)
    if not context_docs:
        return "No indexed content is available yet. Scrape a website first."

    # A retrieved document can be as large as a full page source. Capping it
    # keeps the prompt (and generation cost) bounded and keeps the question,
    # which comes last in the prompt, from being buried behind the context.
    context = " ".join(context_docs)[:max_context_chars]

    prompt = f"Answer the question based on context:\nContext: {context}\nQuestion: {query}"
    result = qa_pipeline(prompt, max_length=256, do_sample=False)
    return result[0]['generated_text']
69
+
70
+ # -------------------------------
71
+ # 5. Streamlit App
72
+ # -------------------------------
73
st.title("🌐 Web Scraping + RAG (Selenium + FLAN-T5-small)")

url = st.text_input("Enter website URL:")
if url and st.button("Scrape & Index"):
    target = url.strip()
    # The URL comes straight from the user and is opened by a real browser on
    # the server, so only web schemes are accepted (this rejects e.g. file://).
    if not target.lower().startswith(("http://", "https://")):
        st.error("Please enter a URL starting with http:// or https://")
    else:
        try:
            scraped_text = scrape_website(target)
        except Exception as exc:  # UI boundary: report instead of a traceback
            st.error(f"❌ Could not scrape {target}: {exc}")
        else:
            add_to_faiss(scraped_text)
            st.success("✅ Website scraped and indexed successfully!")

query = st.text_input("Ask a question:")
if query and st.button("Get Answer"):
    if index.ntotal == 0:
        # Searching an empty index yields no usable neighbour.
        st.warning("Nothing has been indexed yet. Scrape a website first.")
    else:
        answer = answer_query(query)
        st.write("**Answer:**", answer)